""" Real World Example: Complete Data Science Workflow This example ties everything together, showing you a complete data science project from start to finish. We'll analyze bird strike data to answer real questions. What you'll learn: - How to approach a data science problem - Complete workflow: Load → Clean → Analyze → Visualize → Conclude - How to handle real-world data issues - How to draw meaningful conclusions Prerequisites: - pandas library (install with: pip install pandas) - matplotlib library (install with: pip install matplotlib) Real-world context: Bird strikes (when birds collide with aircraft) are a safety concern for aviation. Let's analyze bird strike data to understand patterns and risks. """ import pandas as pd import matplotlib.pyplot as plt print("=" * 80) print("REAL WORLD DATA SCIENCE PROJECT: BIRD STRIKE ANALYSIS") print("=" * 80) print() # STEP 1: DEFINE THE PROBLEM print("=" * 80) print("STEP 1: DEFINE THE PROBLEM") print("=" * 80) print() print("Questions we want to answer:") print(" 1. How many bird strikes occur?") print(" 2. When do bird strikes most commonly occur?") print(" 3. What are the most common bird species involved?") print(" 4. What is the typical damage level?") print() input("Press Enter to continue...") print() # STEP 2: LOAD THE DATA print("=" * 80) print("STEP 2: LOAD THE DATA") print("=" * 80) print() print("📂 Loading bird strike data...") try: data = pd.read_csv('../data/birds.csv') print(f"✅ Successfully loaded {len(data)} records") print() except FileNotFoundError: print("❌ Error: birds.csv not found in data/ folder") print("Please make sure the data file exists.") exit(1) # STEP 3: EXPLORE THE DATA print("=" * 80) print("STEP 3: EXPLORE THE DATA") print("=" * 80) print() print("Dataset Information:") print("-" * 80) print(f" • Shape: {data.shape[0]} rows × {data.shape[1]} columns") print(f" • Columns: {', '.join(data.columns.tolist())}") print() print("First few rows:") print("-" * 80) print(data.head(3)) print() print("Data types:") print("-" * 80) print(data.dtypes) print() # STEP 4: CLEAN THE DATA print("=" * 80) print("STEP 4: CLEAN THE DATA") print("=" * 80) print() # Check for missing values print("Checking for missing values...") missing_counts = data.isnull().sum() missing_percentage = (missing_counts / len(data)) * 100 print("-" * 80) for column in data.columns: if missing_counts[column] > 0: print(f" • {column}: {missing_counts[column]} missing ({missing_percentage[column]:.1f}%)") if missing_counts.sum() == 0: print(" ✅ No missing values found!") print() # Handle missing values in a specific column if needed # For this example, we'll work with the data as-is, but in real projects # you might need to fill or remove missing values print("Data cleaning notes:") print("-" * 80) print(" • In a real project, you would:") print(" - Decide how to handle missing values (remove, fill, or keep)") print(" - Check for duplicate records") print(" - Validate data types") print(" - Look for outliers or incorrect values") print(" • For this example, we'll proceed with the data as-is") print() # STEP 5: ANALYZE THE DATA print("=" * 80) print("STEP 5: ANALYZE THE DATA") print("=" * 80) print() # Analysis 1: Total number of incidents total_strikes = len(data) print("Analysis 1: Overview") print("-" * 80) print(f" • Total bird strikes recorded: {total_strikes:,}") print() # Analysis 2: Find the most common bird species if 'Bird Species' in data.columns: print("Analysis 2: Most Common Bird Species") print("-" * 80) top_species = data['Bird Species'].value_counts().head(5) for i, (species, count) in enumerate(top_species.items(), 1): print(f" {i}. {species}: {count} strikes ({count/total_strikes*100:.1f}%)") print() # Analysis 3: Analyze by time (if time column exists) time_column = None for col in ['FlightDate', 'Date', 'Time']: if col in data.columns: time_column = col break if time_column: print(f"Analysis 3: Temporal Analysis") print("-" * 80) print(f" • Using column: {time_column}") # Additional time analysis could go here print() # STEP 6: VISUALIZE THE DATA print("=" * 80) print("STEP 6: VISUALIZE THE DATA") print("=" * 80) print() # Visualization 1: Top bird species if 'Bird Species' in data.columns: print("Creating visualization 1: Top 10 Bird Species...") top_10_species = data['Bird Species'].value_counts().head(10) plt.figure(figsize=(12, 6)) plt.barh(range(len(top_10_species)), top_10_species.values, color='steelblue') plt.yticks(range(len(top_10_species)), top_10_species.index) plt.xlabel('Number of Strikes', fontsize=12) plt.ylabel('Bird Species', fontsize=12) plt.title('Top 10 Bird Species Involved in Strikes', fontsize=14, fontweight='bold') plt.grid(axis='x', alpha=0.3) plt.tight_layout() plt.savefig('birds_top_species.png', dpi=300, bbox_inches='tight') plt.close() print(" ✅ Saved as 'birds_top_species.png'") print() # Visualization 2: Distribution of another variable # Check what columns are available for interesting visualizations numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns if len(numeric_columns) > 0: print("Creating visualization 2: Numeric data distribution...") col_to_plot = numeric_columns[0] plt.figure(figsize=(10, 6)) data[col_to_plot].hist(bins=30, color='teal', edgecolor='black', alpha=0.7) plt.xlabel(col_to_plot, fontsize=12) plt.ylabel('Frequency', fontsize=12) plt.title(f'Distribution of {col_to_plot}', fontsize=14, fontweight='bold') plt.grid(axis='y', alpha=0.3) plt.tight_layout() plt.savefig('birds_distribution.png', dpi=300, bbox_inches='tight') plt.close() print(f" ✅ Saved as 'birds_distribution.png'") print() # STEP 7: DRAW CONCLUSIONS print("=" * 80) print("STEP 7: DRAW CONCLUSIONS") print("=" * 80) print() print("Key Findings:") print("-" * 80) print(f" 1. We analyzed {total_strikes:,} bird strike incidents") if 'Bird Species' in data.columns: most_common_species = data['Bird Species'].value_counts().index[0] most_common_count = data['Bird Species'].value_counts().values[0] print(f" 2. Most common species: {most_common_species} ({most_common_count} incidents)") print() print("Implications:") print("-" * 80) print(" • This data can help airports implement targeted bird control measures") print(" • Understanding patterns helps improve aircraft safety procedures") print(" • Airlines can use this data for pilot training and awareness") print() # STEP 8: NEXT STEPS print("=" * 80) print("STEP 8: NEXT STEPS & RECOMMENDATIONS") print("=" * 80) print() print("What you could do next:") print(" • Analyze temporal patterns (time of day, season)") print(" • Investigate geographical patterns (airports, regions)") print(" • Correlate with aircraft types or flight phases") print(" • Build a predictive model for high-risk conditions") print(" • Create an interactive dashboard for stakeholders") print() # FINAL SUMMARY print("=" * 80) print("CONGRATULATIONS! YOU'VE COMPLETED A REAL DATA SCIENCE PROJECT!") print("=" * 80) print() print("You practiced the complete data science workflow:") print(" ✓ Step 1: Defined clear questions to answer") print(" ✓ Step 2: Loaded real-world data") print(" ✓ Step 3: Explored the data structure") print(" ✓ Step 4: Cleaned and prepared the data") print(" ✓ Step 5: Analyzed the data to find patterns") print(" ✓ Step 6: Visualized findings") print(" ✓ Step 7: Drew meaningful conclusions") print(" ✓ Step 8: Identified next steps") print() print("This is the same process data scientists use in real projects!") print() print("Your next challenge:") print(" • Try this workflow with other datasets in the data/ folder") print(" • Come up with your own questions and find answers") print(" • Share your findings with others") print("=" * 80)