You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
Data-Science-For-Beginners/examples/02_loading_data.py

129 lines
3.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""
Loading and Exploring Data
In real data science projects, you'll work with data stored in files.
This example shows you how to load data from a CSV file and explore it.
What you'll learn:
- How to load data from a CSV file
- How to view basic information about your dataset
- How to display the first/last rows
- How to get summary statistics
Prerequisites: pandas library (install with: pip install pandas)
"""
# Import the pandas library - it's the most popular tool for working with data in Python
# We give it the short name 'pd' so we can type less
import pandas as pd
# Opening banner: the title framed by two 70-character rules, then a blank line.
print("=" * 70, "Welcome to Data Loading and Exploration!", "=" * 70, sep="\n")
print()

# Step 1: Load data from a CSV file
# CSV ("Comma-Separated Values") is a common plain-text format for tabular data.
# This example reads the birds dataset that ships with this repository.
print("📂 Loading data from birds.csv...", end="\n\n")

# read_csv() returns a DataFrame, pandas' main data structure: a table that
# organizes data in rows and columns (think of it as a smart spreadsheet).
# The path is relative, so it is resolved against the directory the script
# is run from.
csv_path = '../data/birds.csv'
data = pd.read_csv(csv_path)
print("✅ Data loaded successfully!", end="\n\n")
# Step 2: Get basic information about the dataset
print("-" * 70, "BASIC DATASET INFORMATION", "-" * 70, sep="\n")

# shape is a (rows, columns) pair describing the size of the table.
num_rows, num_columns = data.shape
print(f"📊 Dataset size: {num_rows} rows × {num_columns} columns", end="\n\n")

# List every column name, numbered starting from 1.
print("📋 Column names:")
for position, column_name in enumerate(data.columns, start=1):
    print(f" {position}. {column_name}")
print()
# Step 3: Look at the first few rows of data
# A quick preview is the fastest way to see what the data looks like.
print("-" * 70, "FIRST 5 ROWS OF DATA (Preview)", "-" * 70, sep="\n")
first_rows = data.head()  # with no argument, head() returns the first 5 rows
print(first_rows, end="\n\n")

# Step 4: Look at the last few rows
print("-" * 70, "LAST 3 ROWS OF DATA", "-" * 70, sep="\n")
last_rows = data.tail(n=3)  # n=3 asks for the last 3 rows
print(last_rows, end="\n\n")
# Step 5: Get information about data types
print("-" * 70)
print("DATA TYPES AND NON-NULL COUNTS")
print("-" * 70)
# info() shows column names, data types, and the count of non-null values.
# It writes that report to the screen itself and returns None, so it is
# called directly: wrapping it in print() would add a stray "None" line
# after the report.
data.info()
print()
# Step 6: Get statistical summary
print("-" * 70, "STATISTICAL SUMMARY (for numerical columns)", "-" * 70, sep="\n")

# describe() builds a table of statistics such as mean, std, min, and max.
summary_stats = data.describe()
print(summary_stats, end="\n\n")
# Step 7: Check for missing values
print("-" * 70, "MISSING VALUES CHECK", "-" * 70, sep="\n")

# isnull() flags every missing cell; summing the flags column by column
# gives the number of missing values in each column.
missing_values = data.isnull().sum()
print("Number of missing values per column:", missing_values, sep="\n", end="\n\n")

# Adding up the per-column counts tells us whether anything is missing at all.
total_missing = missing_values.sum()
verdict = (
    "✅ Great! No missing values found."
    if total_missing == 0
    else "⚠️ Some columns have missing values. You may need to handle them."
)
print(verdict, end="\n\n")
# Step 8: Get unique values in a column
print("-" * 70, "SAMPLE: UNIQUE VALUES", "-" * 70, sep="\n")

# Use the first column as the example and count its distinct values.
first_column = data.columns[0]
first_column_values = data[first_column]
unique_count = first_column_values.nunique()
print(f"The column '{first_column}' has {unique_count} unique value(s)", end="\n\n")
# Summary: recap what the script demonstrated, framed by 70-character rules.
print("=" * 70, "SUMMARY", "=" * 70, sep="\n")
print("You've learned how to:")
for learned_skill in (
    " ✓ Load data from a CSV file using pandas",
    " ✓ Check the size and shape of your dataset",
    " ✓ View the first and last rows",
    " ✓ Understand data types",
    " ✓ Get statistical summaries",
    " ✓ Check for missing values",
):
    print(learned_skill)
print()
print("Next step: Try loading other CSV files from the data/ folder!")
print("=" * 70)

# Pro Tips:
# - Explore your data before you start analyzing it
# - Check for missing values and work out why they might be missing
# - Confirm that each column's data type makes sense
# - Use head() and tail() to spot obvious problems in your data