""" Loading and Exploring Data In real data science projects, you'll work with data stored in files. This example shows you how to load data from a CSV file and explore it. What you'll learn: - How to load data from a CSV file - How to view basic information about your dataset - How to display the first/last rows - How to get summary statistics Prerequisites: pandas library (install with: pip install pandas) """ # Import the pandas library - it's the most popular tool for working with data in Python # We give it the short name 'pd' so we can type less import pandas as pd print("=" * 70) print("Welcome to Data Loading and Exploration!") print("=" * 70) print() # Step 1: Load data from a CSV file # CSV stands for "Comma-Separated Values" - a common format for storing data # We'll use the birds dataset that comes with this repository print("📂 Loading data from birds.csv...") print() # Load the data into a DataFrame (think of it as a smart spreadsheet) # A DataFrame is pandas' main data structure - it organizes data in rows and columns data = pd.read_csv('../data/birds.csv') print("✅ Data loaded successfully!") print() # Step 2: Get basic information about the dataset print("-" * 70) print("BASIC DATASET INFORMATION") print("-" * 70) # How many rows and columns do we have? num_rows, num_columns = data.shape print(f"📊 Dataset size: {num_rows} rows × {num_columns} columns") print() # What are the column names? print("📋 Column names:") for i, column in enumerate(data.columns, 1): print(f" {i}. {column}") print() # Step 3: Look at the first few rows of data # This gives us a quick preview of what the data looks like print("-" * 70) print("FIRST 5 ROWS OF DATA (Preview)") print("-" * 70) print(data.head()) # head() shows the first 5 rows by default print() # Step 4: Look at the last few rows print("-" * 70) print("LAST 3 ROWS OF DATA") print("-" * 70) print(data.tail(3)) # tail(3) shows the last 3 rows print() # Step 5: Get information about data types print("-" * 70) print("DATA TYPES AND NON-NULL COUNTS") print("-" * 70) print(data.info()) # Shows column names, data types, and count of non-null values print() # Step 6: Get statistical summary print("-" * 70) print("STATISTICAL SUMMARY (for numerical columns)") print("-" * 70) # describe() gives us statistics like mean, std, min, max, etc. print(data.describe()) print() # Step 7: Check for missing values print("-" * 70) print("MISSING VALUES CHECK") print("-" * 70) missing_values = data.isnull().sum() print("Number of missing values per column:") print(missing_values) print() if missing_values.sum() == 0: print("✅ Great! No missing values found.") else: print("⚠️ Some columns have missing values. You may need to handle them.") print() # Step 8: Get unique values in a column print("-" * 70) print("SAMPLE: UNIQUE VALUES") print("-" * 70) # Let's see what unique values exist in the first column first_column = data.columns[0] unique_count = data[first_column].nunique() print(f"The column '{first_column}' has {unique_count} unique value(s)") print() # Summary print("=" * 70) print("SUMMARY") print("=" * 70) print("You've learned how to:") print(" ✓ Load data from a CSV file using pandas") print(" ✓ Check the size and shape of your dataset") print(" ✓ View the first and last rows") print(" ✓ Understand data types") print(" ✓ Get statistical summaries") print(" ✓ Check for missing values") print() print("Next step: Try loading other CSV files from the data/ folder!") print("=" * 70) # Pro Tips: # - Always explore your data before analyzing it # - Check for missing values and understand why they might be missing # - Look at the data types to ensure they make sense # - Use head() and tail() to spot any obvious issues with your data