From 7c14b1b0035f0cbd3dad817199c5a622978e42b7 Mon Sep 17 00:00:00 2001 From: Jasmine Date: Thu, 23 Sep 2021 09:27:28 -0400 Subject: [PATCH] lesson assignment --- .gitignore | 3 + .../14-Introduction/README.md | 6 +- .../14-Introduction/notebook.ipynb | 96 ++----------------- .../15-analyzing/assignment.md | 10 +- .../15-analyzing/notebook.ipynb | 25 +++++ 5 files changed, 47 insertions(+), 93 deletions(-) diff --git a/.gitignore b/.gitignore index 908a848..abcb05e 100644 --- a/.gitignore +++ b/.gitignore @@ -351,3 +351,6 @@ MigrationBackup/ # Ionide (cross platform F# VS Code tools) working folder .ionide/ +4-Data-Science-Lifecycle/14-Introduction/README.md +.vscode/settings.json +Data/Taxi/* diff --git a/4-Data-Science-Lifecycle/14-Introduction/README.md b/4-Data-Science-Lifecycle/14-Introduction/README.md index f80ee3c..26b57b4 100644 --- a/4-Data-Science-Lifecycle/14-Introduction/README.md +++ b/4-Data-Science-Lifecycle/14-Introduction/README.md @@ -1,5 +1,9 @@ # Introduction to the Data Science Lifecycle +|![ Sketchnote by [(@sketchthedocs)](https://sketchthedocs.dev) ](../../sketchnotes/14-DataScience-Lifecycle.png)| +|:---:| +| Introduction to the Data Science Lifecycle - _Sketchnote by [@nitya](https://twitter.com/nitya)_ | + ## Pre-Lecture Quiz [Pre-lecture quiz]() @@ -101,4 +105,4 @@ Applying the Data Science Lifecycle involves multiple roles and tasks, where som ## Assignment -[Assignment Title](assignment.md) +[Exploring and Assessing a Dataset](assignment.md) diff --git a/4-Data-Science-Lifecycle/14-Introduction/notebook.ipynb b/4-Data-Science-Lifecycle/14-Introduction/notebook.ipynb index 42a3539..e28766c 100644 --- a/4-Data-Science-Lifecycle/14-Introduction/notebook.ipynb +++ b/4-Data-Science-Lifecycle/14-Introduction/notebook.ipynb @@ -12,14 +12,9 @@ { "cell_type": "markdown", "source": [ - "# Exploring NYC Taxi data in Winter and Summer" - ], - "metadata": {} - }, - { - "cell_type": "markdown", - "source": [ - "Install azureml-opendatasets package" + "# Exploring NYC Taxi data in Winter and Summer\r\n", + "\r\n", + "Refer to the [Data dictionary](https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf) to explore the columns that have been provided.\r\n" ], "metadata": {} }, @@ -36,91 +31,20 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "source": [ "import pandas as pd\r\n", "import glob\r\n", "\r\n", - "# print(pd.read_csv('../../data/Taxi/yellow_tripdata_2019-01.csv'))\r\n", - "all_files = glob.glob('../../data/Taxi/*.csv')\r\n", + "path = '../../data/Taxi/yellow_tripdata_2019-{}.csv'\r\n", + "july_taxi = pd.read_csv(path.format('07'))\r\n", + "january_taxi = pd.read_csv(path.format('01'))\r\n", + "\r\n", + "df = pd.concat([january_taxi, july_taxi])\r\n", "\r\n", - "df = pd.concat((pd.read_csv(f) for f in all_files))\r\n", "print(df)" ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", - "0 1.0 2019-01-01 00:46:40 2019-01-01 00:53:20 1.0 \n", - "1 1.0 2019-01-01 00:59:47 2019-01-01 01:18:59 1.0 \n", - "2 2.0 2018-12-21 13:48:30 2018-12-21 13:52:40 3.0 \n", - "3 2.0 2018-11-28 15:52:25 2018-11-28 15:55:45 5.0 \n", - "4 2.0 2018-11-28 15:56:57 2018-11-28 15:58:33 5.0 \n", - "... ... ... ... ... 
\n", - "6896312 NaN 2019-12-31 00:07:00 2019-12-31 00:46:00 NaN \n", - "6896313 NaN 2019-12-31 00:20:00 2019-12-31 00:47:00 NaN \n", - "6896314 NaN 2019-12-31 00:50:00 2019-12-31 01:21:00 NaN \n", - "6896315 NaN 2019-12-31 00:38:19 2019-12-31 01:19:37 NaN \n", - "6896316 NaN 2019-12-31 00:21:00 2019-12-31 00:56:00 NaN \n", - "\n", - " trip_distance RatecodeID store_and_fwd_flag PULocationID \\\n", - "0 1.50 1.0 N 151 \n", - "1 2.60 1.0 N 239 \n", - "2 0.00 1.0 N 236 \n", - "3 0.00 1.0 N 193 \n", - "4 0.00 2.0 N 193 \n", - "... ... ... ... ... \n", - "6896312 12.78 NaN NaN 230 \n", - "6896313 18.52 NaN NaN 219 \n", - "6896314 13.13 NaN NaN 161 \n", - "6896315 14.51 NaN NaN 230 \n", - "6896316 -17.16 NaN NaN 193 \n", - "\n", - " DOLocationID payment_type fare_amount extra mta_tax tip_amount \\\n", - "0 239 1.0 7.00 0.50 0.5 1.65 \n", - "1 246 1.0 14.00 0.50 0.5 1.00 \n", - "2 236 1.0 4.50 0.50 0.5 0.00 \n", - "3 193 2.0 3.50 0.50 0.5 0.00 \n", - "4 193 2.0 52.00 0.00 0.5 0.00 \n", - "... ... ... ... ... ... ... \n", - "6896312 72 NaN 32.32 2.75 0.5 0.00 \n", - "6896313 32 NaN 51.63 2.75 0.5 0.00 \n", - "6896314 76 NaN 38.02 2.75 0.5 0.00 \n", - "6896315 21 NaN 41.86 2.75 0.0 0.00 \n", - "6896316 219 NaN 44.62 2.75 0.5 0.00 \n", - "\n", - " tolls_amount improvement_surcharge total_amount \\\n", - "0 0.00 0.3 9.95 \n", - "1 0.00 0.3 16.30 \n", - "2 0.00 0.3 5.80 \n", - "3 0.00 0.3 7.55 \n", - "4 0.00 0.3 55.55 \n", - "... ... ... ... \n", - "6896312 6.12 0.3 41.99 \n", - "6896313 6.12 0.3 61.30 \n", - "6896314 6.12 0.3 47.69 \n", - "6896315 6.12 0.3 51.03 \n", - "6896316 0.00 0.3 48.17 \n", - "\n", - " congestion_surcharge \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN \n", - "... ... \n", - "6896312 0.0 \n", - "6896313 0.0 \n", - "6896314 0.0 \n", - "6896315 0.0 \n", - "6896316 0.0 \n", - "\n", - "[40908284 rows x 18 columns]\n" - ] - } - ], + "outputs": [], "metadata": {} } ], diff --git a/4-Data-Science-Lifecycle/15-analyzing/assignment.md b/4-Data-Science-Lifecycle/15-analyzing/assignment.md index 5bbf35c..b23f76a 100644 --- a/4-Data-Science-Lifecycle/15-analyzing/assignment.md +++ b/4-Data-Science-Lifecycle/15-analyzing/assignment.md @@ -1,18 +1,16 @@ -# Exploration with two data sets +# Analyzing for answers -A client has approached your team for help in investigating a taxi customer's seasonal spending habits in New York City. +This continues the process of the lifecycle They want to know: **Do yellow taxi passengers in New York City tip drivers more in the winter or summer?** -Your team is in the [Capturing](Readme.md#Capturing) stage of the Data Science Lifecycle and you are in charge of exploring the data. You have been provided a notebook and data from Azure Open Datasets to explore. You have decided to begin by exploring taxi data in the year 2019. For summer you choose June, July, and August and for winter you choose January, February, and December. +Your team is in the [Analyzing](Readme.md) stage of the Data Science Lifecycle.. You have been provided a notebook and data from Azure Open Datasets to explore. For summer you choose June, July, and August and for winter you choose January, February, and December. ## Instructions In this directory is a [notebook](notebook.ipynb) that uses Python to load 6 months of yellow taxi trip data from the [NYC Taxi & Limousine Commission](https://docs.microsoft.com/en-us/azure/open-datasets/dataset-taxi-yellow?tabs=azureml-opendatasets) and Integrated Surface Data from NOAA. 
These datasets have been joined together in a Pandas dataframe.

-Your task is to identify the columns that are the most likely required to answer this question, then reorganize the joined dataset so that these columns are displayed first.
-
-Finally, write 3 questions that you would ask the client for more clarification and better understanding of the problem.
+Your task is to ___

## Rubric

diff --git a/4-Data-Science-Lifecycle/15-analyzing/notebook.ipynb b/4-Data-Science-Lifecycle/15-analyzing/notebook.ipynb
index e69de29..c526e80 100644
--- a/4-Data-Science-Lifecycle/15-analyzing/notebook.ipynb
+++ b/4-Data-Science-Lifecycle/15-analyzing/notebook.ipynb
@@ -0,0 +1,25 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "source": [
+    "import glob\r\n",
+    "import pandas as pd\r\n",
+    "\r\n",
+    "# Combine the monthly yellow taxi CSV files into a single dataframe for analysis\r\n",
+    "df = pd.concat(pd.read_csv(f) for f in glob.glob('../../data/Taxi/*.csv'))"
+   ],
+   "outputs": [],
+   "metadata": {}
+  }
+ ],
+ "metadata": {
+  "orig_nbformat": 4,
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
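
The assignment above asks whether yellow taxi passengers tip drivers more in the winter or summer. A minimal sketch of how that comparison could start is shown below. It assumes the local monthly CSVs under `../../data/Taxi/` used in the notebooks in this patch (not the Azure Open Datasets join), the column names from the linked TLC data dictionary, and the assignment's own month split (December through February as winter, June through August as summer):

```python
import glob

import pandas as pd

# Assumes the monthly yellow taxi CSVs are available locally, as in the notebooks above.
all_files = glob.glob('../../data/Taxi/*.csv')
df = pd.concat(pd.read_csv(f) for f in all_files)

# Tag each trip with a season based on its pickup month
# (winter = Dec/Jan/Feb, summer = Jun/Jul/Aug, everything else ignored).
pickup_month = pd.to_datetime(df['tpep_pickup_datetime']).dt.month
df['season'] = pickup_month.map(
    lambda m: 'winter' if m in (12, 1, 2) else ('summer' if m in (6, 7, 8) else 'other')
)

# Compare the average tip per season.
print(df[df['season'] != 'other'].groupby('season')['tip_amount'].mean())
```

Note that the TLC data dictionary linked in the introduction notebook states that `tip_amount` only captures credit card tips, so a reasonable follow-up would be to compare `tip_amount / fare_amount` or to restrict the comparison to card payments (`payment_type == 1`).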