You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Data-Science-For-Beginners/4-Data-Science-Lifecycle/14-Introduction/notebook.ipynb

152 lines
6.9 KiB

{
"cells": [
{
"cell_type": "markdown",
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\r\n",
"\r\n",
"Licensed under the MIT License."
],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"# Exploring NYC Taxi data in Winter and Summer"
],
"metadata": {}
},
{
"cell_type": "markdown",
"source": [
"Install the pandas package, which is used below to load the taxi trip CSV files."
],
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"source": [
"# Use the %pip magic rather than !pip: it runs pip for the environment of the\r\n",
"# running kernel, so the package is importable from this notebook.\r\n",
"%pip install pandas"
],
"outputs": [],
"metadata": {
"scrolled": true
}
},
{
"cell_type": "code",
"execution_count": 19,
"source": [
"import glob\r\n",
"\r\n",
"import pandas as pd\r\n",
"\r\n",
"# Pattern matching the CSV files in the Taxi data folder (relative to this notebook).\r\n",
"csv_pattern = '../../data/Taxi/*.csv'\r\n",
"\r\n",
"# glob.glob returns matches in arbitrary order; sort them so the row order of\r\n",
"# the combined frame is the same on every run and every machine.\r\n",
"all_files = sorted(glob.glob(csv_pattern))\r\n",
"\r\n",
"# Fail early with a clear message instead of pandas' generic\r\n",
"# 'No objects to concatenate' error when the data folder is empty or missing.\r\n",
"if not all_files:\r\n",
"    raise FileNotFoundError(f'No CSV files match {csv_pattern}')\r\n",
"\r\n",
"# Read each file lazily and stack the frames. ignore_index is not set, so every\r\n",
"# file keeps its own 0-based row labels and labels repeat in the combined frame.\r\n",
"df = pd.concat(pd.read_csv(csv_path) for csv_path in all_files)\r\n",
"print(df)"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n",
"0 1.0 2019-01-01 00:46:40 2019-01-01 00:53:20 1.0 \n",
"1 1.0 2019-01-01 00:59:47 2019-01-01 01:18:59 1.0 \n",
"2 2.0 2018-12-21 13:48:30 2018-12-21 13:52:40 3.0 \n",
"3 2.0 2018-11-28 15:52:25 2018-11-28 15:55:45 5.0 \n",
"4 2.0 2018-11-28 15:56:57 2018-11-28 15:58:33 5.0 \n",
"... ... ... ... ... \n",
"6896312 NaN 2019-12-31 00:07:00 2019-12-31 00:46:00 NaN \n",
"6896313 NaN 2019-12-31 00:20:00 2019-12-31 00:47:00 NaN \n",
"6896314 NaN 2019-12-31 00:50:00 2019-12-31 01:21:00 NaN \n",
"6896315 NaN 2019-12-31 00:38:19 2019-12-31 01:19:37 NaN \n",
"6896316 NaN 2019-12-31 00:21:00 2019-12-31 00:56:00 NaN \n",
"\n",
" trip_distance RatecodeID store_and_fwd_flag PULocationID \\\n",
"0 1.50 1.0 N 151 \n",
"1 2.60 1.0 N 239 \n",
"2 0.00 1.0 N 236 \n",
"3 0.00 1.0 N 193 \n",
"4 0.00 2.0 N 193 \n",
"... ... ... ... ... \n",
"6896312 12.78 NaN NaN 230 \n",
"6896313 18.52 NaN NaN 219 \n",
"6896314 13.13 NaN NaN 161 \n",
"6896315 14.51 NaN NaN 230 \n",
"6896316 -17.16 NaN NaN 193 \n",
"\n",
" DOLocationID payment_type fare_amount extra mta_tax tip_amount \\\n",
"0 239 1.0 7.00 0.50 0.5 1.65 \n",
"1 246 1.0 14.00 0.50 0.5 1.00 \n",
"2 236 1.0 4.50 0.50 0.5 0.00 \n",
"3 193 2.0 3.50 0.50 0.5 0.00 \n",
"4 193 2.0 52.00 0.00 0.5 0.00 \n",
"... ... ... ... ... ... ... \n",
"6896312 72 NaN 32.32 2.75 0.5 0.00 \n",
"6896313 32 NaN 51.63 2.75 0.5 0.00 \n",
"6896314 76 NaN 38.02 2.75 0.5 0.00 \n",
"6896315 21 NaN 41.86 2.75 0.0 0.00 \n",
"6896316 219 NaN 44.62 2.75 0.5 0.00 \n",
"\n",
" tolls_amount improvement_surcharge total_amount \\\n",
"0 0.00 0.3 9.95 \n",
"1 0.00 0.3 16.30 \n",
"2 0.00 0.3 5.80 \n",
"3 0.00 0.3 7.55 \n",
"4 0.00 0.3 55.55 \n",
"... ... ... ... \n",
"6896312 6.12 0.3 41.99 \n",
"6896313 6.12 0.3 61.30 \n",
"6896314 6.12 0.3 47.69 \n",
"6896315 6.12 0.3 51.03 \n",
"6896316 0.00 0.3 48.17 \n",
"\n",
" congestion_surcharge \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"... ... \n",
"6896312 0.0 \n",
"6896313 0.0 \n",
"6896314 0.0 \n",
"6896315 0.0 \n",
"6896316 0.0 \n",
"\n",
"[40908284 rows x 18 columns]\n"
]
}
],
"metadata": {}
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3.9.7 64-bit ('venv': venv)"
},
"language_info": {
"mimetype": "text/x-python",
"name": "python",
"pygments_lexer": "ipython3",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"version": "3.9.7",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"name": "04-nyc-taxi-join-weather-in-pandas",
"notebookId": 1709144033725344,
"interpreter": {
"hash": "6b9b57232c4b57163d057191678da2030059e733b8becc68f245de5a75abe84e"
}
},
"nbformat": 4,
"nbformat_minor": 2
}