You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Data-Science-For-Beginners/examples/03_simple_analysis.py

175 lines
5.4 KiB

"""
Simple Data Analysis
Learn how to analyze data and answer questions about it.
This example demonstrates common data analysis operations.
What you'll learn:
- How to calculate statistics on your data
- How to filter data based on conditions
- How to group and aggregate data
- How to sort data
Prerequisites: pandas library (install with: pip install pandas)
"""
import pandas as pd
print("=" * 70)
print("Simple Data Analysis Tutorial")
print("=" * 70)
print()
# Load a dataset - we'll use the honey production data
print("📂 Loading honey production data...")
data = pd.read_csv('../data/honey.csv')
print("✅ Data loaded!\n")
# Quick look at the data
print("-" * 70)
print("FIRST FEW ROWS")
print("-" * 70)
print(data.head(3))
print()
# SECTION 1: Basic Statistics
print("=" * 70)
print("SECTION 1: CALCULATING STATISTICS")
print("=" * 70)
print()
# Let's look at the 'totalprod' column (total production)
if 'totalprod' in data.columns:
total_production = data['totalprod']
print("Total Honey Production Statistics:")
print("-" * 70)
print(f" Mean (Average): {total_production.mean():,.2f}")
print(f" Median (Middle): {total_production.median():,.2f}")
print(f" Mode (Most common): {total_production.mode().values[0]:,.2f}")
print(f" Std Dev: {total_production.std():,.2f}")
print(f" Minimum: {total_production.min():,.2f}")
print(f" Maximum: {total_production.max():,.2f}")
print()
# SECTION 2: Filtering Data
print("=" * 70)
print("SECTION 2: FILTERING DATA")
print("=" * 70)
print()
# Let's filter the data to show only records from a specific year
if 'year' in data.columns:
year_to_filter = 2000
filtered_data = data[data['year'] == year_to_filter]
print(f"Showing data for year {year_to_filter}:")
print("-" * 70)
print(f"Found {len(filtered_data)} records")
print()
print(filtered_data.head())
print()
# Filter based on multiple conditions
if 'totalprod' in data.columns and 'year' in data.columns:
# Find records where production was above 10 million pounds after 2010
high_production = data[(data['totalprod'] > 10000000) & (data['year'] > 2010)]
print("High production years (>10M pounds after 2010):")
print("-" * 70)
print(f"Found {len(high_production)} records")
print()
# SECTION 3: Grouping and Aggregating
print("=" * 70)
print("SECTION 3: GROUPING AND AGGREGATING DATA")
print("=" * 70)
print()
# Group by state and calculate average production
if 'state' in data.columns and 'totalprod' in data.columns:
# Group the data by state and calculate mean production
state_averages = data.groupby('state')['totalprod'].mean()
# Sort to see which states have highest average production
state_averages_sorted = state_averages.sort_values(ascending=False)
print("Top 10 States by Average Honey Production:")
print("-" * 70)
for i, (state, avg_prod) in enumerate(state_averages_sorted.head(10).items(), 1):
print(f"{i:2d}. {state:20s} {avg_prod:,.0f} pounds")
print()
# SECTION 4: Sorting Data
print("=" * 70)
print("SECTION 4: SORTING DATA")
print("=" * 70)
print()
if 'totalprod' in data.columns:
# Sort by total production in descending order
sorted_data = data.sort_values('totalprod', ascending=False)
print("Records with Highest Production:")
print("-" * 70)
# Show the top 5 records
columns_to_show = ['state', 'year', 'totalprod'] if all(col in data.columns for col in ['state', 'year', 'totalprod']) else data.columns[:3]
print(sorted_data[columns_to_show].head())
print()
# SECTION 5: Counting Values
print("=" * 70)
print("SECTION 5: COUNTING VALUES")
print("=" * 70)
print()
if 'state' in data.columns:
# Count how many records we have for each state
state_counts = data['state'].value_counts()
print("Number of records per state (top 10):")
print("-" * 70)
for state, count in state_counts.head(10).items():
print(f"{state:20s} {count:3d} records")
print()
# SECTION 6: Answering a Question
print("=" * 70)
print("SECTION 6: ANSWERING A REAL QUESTION")
print("=" * 70)
print()
# Question: Which state had the highest honey production in 2012?
if all(col in data.columns for col in ['state', 'year', 'totalprod']):
year_2012 = data[data['year'] == 2012]
if len(year_2012) > 0:
# Find the row with maximum production in 2012
max_prod_idx = year_2012['totalprod'].idxmax()
max_prod_state = year_2012.loc[max_prod_idx, 'state']
max_prod_amount = year_2012.loc[max_prod_idx, 'totalprod']
print("Question: Which state had the highest honey production in 2012?")
print("-" * 70)
print(f"Answer: {max_prod_state}")
print(f"Production: {max_prod_amount:,.0f} pounds")
print()
# Summary
print("=" * 70)
print("CONGRATULATIONS!")
print("=" * 70)
print("You've learned how to:")
print(" ✓ Calculate basic statistics (mean, median, mode, etc.)")
print(" ✓ Filter data based on conditions")
print(" ✓ Group data and calculate aggregates")
print(" ✓ Sort data to find top/bottom values")
print(" ✓ Count occurrences of values")
print(" ✓ Answer real questions using data")
print()
print("Try this yourself:")
print(" • Find the state with the lowest average production")
print(" • Calculate total production by year")
print(" • Find trends over time")
print("=" * 70)