You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Data-Science-For-Beginners/examples/05_real_world_example.py

253 lines
7.9 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

"""
Real World Example: Complete Data Science Workflow

This example ties everything together, showing you a complete data science project
from start to finish. We'll analyze bird strike data to answer real questions.

What you'll learn:
- How to approach a data science problem
- Complete workflow: Define → Load → Explore → Clean → Analyze → Visualize → Conclude
- How to handle real-world data issues
- How to draw meaningful conclusions

Prerequisites:
- pandas library (install with: pip install pandas)
- matplotlib library (install with: pip install matplotlib)

Real-world context:
Bird strikes (when birds collide with aircraft) are a safety concern for aviation.
Let's analyze bird strike data to understand patterns and risks.
"""
import pandas as pd
import matplotlib.pyplot as plt
# Opening banner for the whole walkthrough.
print("=" * 80)
print("REAL WORLD DATA SCIENCE PROJECT: BIRD STRIKE ANALYSIS")
print("=" * 80)
print()

# STEP 1: DEFINE THE PROBLEM
# The questions are stated before any data is loaded so that each later
# step can be read as working towards one of them.
print("=" * 80)
print("STEP 1: DEFINE THE PROBLEM")
print("=" * 80)
print()
print("Questions we want to answer:")
for question in (
    " 1. How many bird strikes occur?",
    " 2. When do bird strikes most commonly occur?",
    " 3. What are the most common bird species involved?",
    " 4. What is the typical damage level?",
):
    print(question)
print()

# Pause so the reader can take in the questions. When stdin is at EOF
# (for example `python script.py < /dev/null`, or a CI job) input() raises
# EOFError; treat that as "continue" instead of aborting the walkthrough.
try:
    input("Press Enter to continue...")
except EOFError:
    pass
print()
# STEP 2: LOAD THE DATA
print("=" * 80)
print("STEP 2: LOAD THE DATA")
print("=" * 80)
print()
print("📂 Loading bird strike data...")
try:
    # Relative path: pandas resolves it against the current working
    # directory, not against the location of this script.
    data = pd.read_csv('../data/birds.csv')
except FileNotFoundError:
    print("❌ Error: birds.csv not found in data/ folder")
    print("Please make sure the data file exists.")
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # `site` module for interactive sessions and is absent under `python -S`
    # or in some embedded/frozen interpreters.
    raise SystemExit(1)
else:
    # Success reporting lives outside the try body so that only the read
    # itself is guarded by the FileNotFoundError handler.
    print(f"✅ Successfully loaded {len(data)} records")
    print()
# STEP 3: EXPLORE THE DATA
print("=" * 80, "STEP 3: EXPLORE THE DATA", "=" * 80, "", sep="\n")

# Headline facts: how many rows and columns there are, and the column names.
row_count, column_count = data.shape
column_names = ', '.join(data.columns.tolist())
print("Dataset Information:", "-" * 80, sep="\n")
print(f" • Shape: {row_count} rows × {column_count} columns")
print(f" • Columns: {column_names}")
print()

# A three-row sample gives a feel for the actual values.
print("First few rows:", "-" * 80, sep="\n")
print(data.head(3))
print()

# The dtype pandas inferred for each column.
print("Data types:", "-" * 80, sep="\n")
print(data.dtypes)
print()
# STEP 4: CLEAN THE DATA
print("=" * 80, "STEP 4: CLEAN THE DATA", "=" * 80, "", sep="\n")

# Check for missing values: count the null cells per column and express
# each count as a percentage of the total number of rows.
print("Checking for missing values...")
missing_counts = data.isnull().sum()
missing_percentage = (missing_counts / len(data)) * 100
print("-" * 80)

# Report only the columns that actually contain gaps, in column order.
incomplete_columns = missing_counts[missing_counts > 0]
for column, n_missing in incomplete_columns.items():
    print(f"{column}: {n_missing} missing ({missing_percentage[column]:.1f}%)")
if incomplete_columns.empty:
    print(" ✅ No missing values found!")
print()

# Handle missing values in a specific column if needed
# For this example, we'll work with the data as-is, but in real projects
# you might need to fill or remove missing values
print("Data cleaning notes:", "-" * 80, sep="\n")
cleaning_notes = (
    " • In a real project, you would:",
    " - Decide how to handle missing values (remove, fill, or keep)",
    " - Check for duplicate records",
    " - Validate data types",
    " - Look for outliers or incorrect values",
    " • For this example, we'll proceed with the data as-is",
)
for note in cleaning_notes:
    print(note)
print()
# STEP 5: ANALYZE THE DATA
print("=" * 80, "STEP 5: ANALYZE THE DATA", "=" * 80, "", sep="\n")

# Analysis 1: Total number of incidents
# The script treats every row as one recorded strike, so the row count is
# the total. Later sections reuse `total_strikes`.
total_strikes = data.shape[0]
print("Analysis 1: Overview", "-" * 80, sep="\n")
print(f" • Total bird strikes recorded: {total_strikes:,}")
print()
# Analysis 2: Find the most common bird species
# Skipped entirely when the dataset has no 'Bird Species' column.
if 'Bird Species' in data.columns:
    print("Analysis 2: Most Common Bird Species", "-" * 80, sep="\n")
    species_counts = data['Bird Species'].value_counts()
    # Rank the five most frequent species and show each one's share of all
    # recorded strikes.
    for rank, (species, strikes) in enumerate(species_counts.head(5).items(), start=1):
        share = strikes / total_strikes * 100
        print(f" {rank}. {species}: {strikes} strikes ({share:.1f}%)")
    print()
# Analysis 3: Analyze by time (if time column exists)
# Pick the first candidate column name that the dataset actually contains;
# `time_column` stays None when none of them is present.
time_column = next(
    (col for col in ('FlightDate', 'Date', 'Time') if col in data.columns),
    None,
)
if time_column is not None:
    print("Analysis 3: Temporal Analysis")
    print("-" * 80)
    print(f" • Using column: {time_column}")
    # Additional time analysis could go here
    print()
# STEP 6: VISUALIZE THE DATA
print("=" * 80)
print("STEP 6: VISUALIZE THE DATA")
print("=" * 80)
print()

# Visualization 1: Top bird species
# Horizontal bar chart of the ten most frequent species, saved as a PNG in
# the current working directory. Skipped when the column is absent.
if 'Bird Species' in data.columns:
    print("Creating visualization 1: Top 10 Bird Species...")
    top_10_species = data['Bird Species'].value_counts().head(10)
    bar_positions = range(len(top_10_species))
    plt.figure(figsize=(12, 6))
    plt.barh(bar_positions, top_10_species.values, color='steelblue')
    plt.yticks(bar_positions, top_10_species.index)
    # barh places position 0 at the bottom of the axis. value_counts() is
    # sorted in descending order, so without this flip the most common
    # species would be drawn last instead of at the top of the ranking.
    plt.gca().invert_yaxis()
    plt.xlabel('Number of Strikes', fontsize=12)
    plt.ylabel('Bird Species', fontsize=12)
    plt.title('Top 10 Bird Species Involved in Strikes', fontsize=14, fontweight='bold')
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.savefig('birds_top_species.png', dpi=300, bbox_inches='tight')
    # Close the figure so it does not stay open after saving.
    plt.close()
    print(" ✅ Saved as 'birds_top_species.png'")
    print()
# Visualization 2: Distribution of another variable
# Histogram of the first numeric column, saved as a PNG in the current
# working directory. Selecting on the generic 'number' dtype also catches
# int32/float32 and pandas nullable numeric columns, which an explicit
# ['int64', 'float64'] list would silently skip.
numeric_columns = data.select_dtypes(include='number').columns
if not numeric_columns.empty:
    print("Creating visualization 2: Numeric data distribution...")
    # No domain-specific choice is made here: simply the first numeric column.
    col_to_plot = numeric_columns[0]
    plt.figure(figsize=(10, 6))
    data[col_to_plot].hist(bins=30, color='teal', edgecolor='black', alpha=0.7)
    plt.xlabel(col_to_plot, fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.title(f'Distribution of {col_to_plot}', fontsize=14, fontweight='bold')
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig('birds_distribution.png', dpi=300, bbox_inches='tight')
    plt.close()
    print(" ✅ Saved as 'birds_distribution.png'")
    print()
# STEP 7: DRAW CONCLUSIONS
print("=" * 80)
print("STEP 7: DRAW CONCLUSIONS")
print("=" * 80)
print()
print("Key Findings:")
print("-" * 80)
print(f" 1. We analyzed {total_strikes:,} bird strike incidents")
if 'Bird Species' in data.columns:
    # Count once and reuse. value_counts() drops nulls, so the result is
    # empty when the column has no usable values; indexing [0] on it would
    # raise IndexError, hence the guard.
    species_counts = data['Bird Species'].value_counts()
    if not species_counts.empty:
        most_common_species = species_counts.index[0]
        most_common_count = species_counts.iloc[0]
        print(f" 2. Most common species: {most_common_species} ({most_common_count} incidents)")
print()
print("Implications:")
print("-" * 80)
print(" • This data can help airports implement targeted bird control measures")
print(" • Understanding patterns helps improve aircraft safety procedures")
print(" • Airlines can use this data for pilot training and awareness")
print()
# STEP 8: NEXT STEPS
print("=" * 80, "STEP 8: NEXT STEPS & RECOMMENDATIONS", "=" * 80, "", sep="\n")

# Follow-up ideas, printed one bullet per line.
print("What you could do next:")
follow_up_ideas = (
    "Analyze temporal patterns (time of day, season)",
    "Investigate geographical patterns (airports, regions)",
    "Correlate with aircraft types or flight phases",
    "Build a predictive model for high-risk conditions",
    "Create an interactive dashboard for stakeholders",
)
for idea in follow_up_ideas:
    print(f" • {idea}")
print()
# FINAL SUMMARY
print(
    "=" * 80,
    "CONGRATULATIONS! YOU'VE COMPLETED A REAL DATA SCIENCE PROJECT!",
    "=" * 80,
    "",
    sep="\n",
)

# Recap of the eight steps, numbered in the order they were printed above.
print("You practiced the complete data science workflow:")
completed_steps = (
    "Defined clear questions to answer",
    "Loaded real-world data",
    "Explored the data structure",
    "Cleaned and prepared the data",
    "Analyzed the data to find patterns",
    "Visualized findings",
    "Drew meaningful conclusions",
    "Identified next steps",
)
for step_number, accomplishment in enumerate(completed_steps, start=1):
    print(f" ✓ Step {step_number}: {accomplishment}")
print()
print("This is the same process data scientists use in real projects!")
print()

# Closing suggestions for further practice.
print("Your next challenge:")
for challenge in (
    "Try this workflow with other datasets in the data/ folder",
    "Come up with your own questions and find answers",
    "Share your findings with others",
):
    print(f" • {challenge}")
print("=" * 80)