"""
Loading and Exploring Data

In real data science projects, you'll work with data stored in files.
This example shows you how to load data from a CSV file and explore it.

What you'll learn:
- How to load data from a CSV file
- How to view basic information about your dataset
- How to display the first/last rows
- How to get summary statistics

Prerequisites: pandas library (install with: pip install pandas)
"""

# pandas is the most popular Python library for working with data.
# Importing it under the short alias 'pd' keeps the rest of the script concise.
import pandas as pd

# Greeting banner: a 70-character rule, the title, another rule, then a blank line.
print("=" * 70, "Welcome to Data Loading and Exploration!", "=" * 70, sep="\n")
print()

# Step 1: Read a CSV file into memory
# CSV ("Comma-Separated Values") is a common plain-text format for storing data.
# The file read here is the birds dataset that ships with this repository.
print("📂 Loading data from birds.csv...", end="\n\n")

# read_csv() produces a DataFrame - pandas' main data structure, which organizes
# data in rows and columns (think of it as a smart spreadsheet).
# NOTE: the path is relative, so it is resolved against the directory the
# script is run from.
data = pd.read_csv(filepath_or_buffer='../data/birds.csv')

print("✅ Data loaded successfully!", end="\n\n")

# Step 2: Report the dataset's size and its column names
print("-" * 70, "BASIC DATASET INFORMATION", "-" * 70, sep="\n")

# Row count is the length of the DataFrame; column count is the length of its
# column index (together these are the two entries of data.shape).
num_rows = len(data)
num_columns = len(data.columns)
print(f"📊 Dataset size: {num_rows} rows × {num_columns} columns", end="\n\n")

# List every column name, numbered from 1 for readability.
print("📋 Column names:")
for i, column in enumerate(data.columns, start=1):
    print(f" {i}. {column}")
print()

# Step 3: Preview the top of the table
# A glance at the opening rows is the quickest way to see what the data looks like.
print("-" * 70, "FIRST 5 ROWS OF DATA (Preview)", "-" * 70, sep="\n")
print(data.head(n=5))  # 5 is head()'s default row count, spelled out here
print()

# Step 4: Preview the bottom of the table
print("-" * 70, "LAST 3 ROWS OF DATA", "-" * 70, sep="\n")
print(data.tail(n=3))  # request exactly the final 3 rows
print()

# Step 5: Get information about data types
print("-" * 70)
print("DATA TYPES AND NON-NULL COUNTS")
print("-" * 70)
# info() writes its report (column names, data types, and count of non-null
# values) straight to standard output and returns None. Call it on its own:
# wrapping it in print() would add a stray "None" line after the report.
data.info()
print()

# Step 6: Summarize the data statistically
print("-" * 70, "STATISTICAL SUMMARY (for numerical columns)", "-" * 70, sep="\n")
# describe() reports statistics such as mean, std, min and max.
print(data.describe(), end="\n\n")

# Step 7: Look for gaps in the data
print("-" * 70, "MISSING VALUES CHECK", "-" * 70, sep="\n")

# isna() flags every missing cell; summing the flags per column gives the
# number of missing entries in each column.
missing_values = data.isna().sum()
print("Number of missing values per column:")
print(missing_values, end="\n\n")

# Any non-zero per-column count means at least one value is missing somewhere.
if missing_values.any():
    print("⚠️ Some columns have missing values. You may need to handle them.")
else:
    print("✅ Great! No missing values found.")
print()

# Step 8: Count distinct values in one column
print("-" * 70, "SAMPLE: UNIQUE VALUES", "-" * 70, sep="\n")

# Use the first column of the table as the example.
first_column = data.columns[0]
# dropna=True is nunique()'s default: missing values are not counted as a value.
unique_count = data[first_column].nunique(dropna=True)
print(f"The column '{first_column}' has {unique_count} unique value(s)")
print()

# Closing recap of everything the script demonstrated
print("=" * 70, "SUMMARY", "=" * 70, sep="\n")
print("You've learned how to:")
for achievement in (
    " ✓ Load data from a CSV file using pandas",
    " ✓ Check the size and shape of your dataset",
    " ✓ View the first and last rows",
    " ✓ Understand data types",
    " ✓ Get statistical summaries",
    " ✓ Check for missing values",
):
    print(achievement)
print()
print("Next step: Try loading other CSV files from the data/ folder!")
print("=" * 70)

# Pro Tips:
# - Always explore your data before analyzing it
# - Check for missing values and understand why they might be missing
# - Look at the data types to ensure they make sense
# - Use head() and tail() to spot any obvious issues with your data