From 22981456005d32d85dabcf4d527a205e8988252b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 3 Oct 2025 10:38:55 +0000 Subject: [PATCH] Add beginner-friendly examples with comprehensive documentation Co-authored-by: leestott <2511341+leestott@users.noreply.github.com> --- .gitignore | 5 + README.md | 16 ++ examples/01_hello_world_data_science.py | 87 ++++++++ examples/02_loading_data.py | 128 ++++++++++++ examples/03_simple_analysis.py | 174 ++++++++++++++++ examples/04_basic_visualization.py | 210 ++++++++++++++++++++ examples/05_real_world_example.py | 252 ++++++++++++++++++++++++ examples/README.md | 135 +++++++++++++ 8 files changed, 1007 insertions(+) create mode 100644 examples/01_hello_world_data_science.py create mode 100644 examples/02_loading_data.py create mode 100644 examples/03_simple_analysis.py create mode 100644 examples/04_basic_visualization.py create mode 100644 examples/05_real_world_example.py create mode 100644 examples/README.md diff --git a/.gitignore b/.gitignore index 87807e70..0e483527 100644 --- a/.gitignore +++ b/.gitignore @@ -353,3 +353,8 @@ MigrationBackup/ .ionide/ .vscode/settings.json +# Example output files (generated by running example scripts) +examples/*.png +examples/*.jpg +examples/*.jpeg + diff --git a/README.md b/README.md index cdffd920..959864f5 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,8 @@ Get started with the following resources: # Getting Started +> **Complete Beginners**: New to data science? Start with our [beginner-friendly examples](examples/README.md)! These simple, well-commented examples will help you understand the basics before diving into the full curriculum. + > **Teachers**: we have [included some suggestions](for-teachers.md) on how to use this curriculum. We'd love your feedback [in our discussion forum](https://github.com/microsoft/Data-Science-For-Beginners/discussions)! 
> **[Students](https://aka.ms/student-page)**: to use this curriculum on your own, fork the entire repo and complete the exercises on your own, starting with a pre-lecture quiz. Then read the lecture and complete the rest of the activities. Try to create the projects by comprehending the lessons rather than copying the solution code; however, that code is available in the /solutions folders in each project-oriented lesson. Another idea would be to form a study group with friends and go through the content together. For further study, we recommend [Microsoft Learn](https://docs.microsoft.com/en-us/users/jenlooper-2911/collections/qprpajyoy3x0g7?WT.mc_id=academic-77958-bethanycheum). @@ -86,6 +88,20 @@ In addition, a low-stakes quiz before a class sets the intention of the student > **A note about quizzes**: All quizzes are contained in the Quiz-App folder, for 40 total quizzes of three questions each. They are linked from within the lessons, but the quiz app can be run locally or deployed to Azure; follow the instruction in the `quiz-app` folder. They are gradually being localized. +## 🎓 Beginner-Friendly Examples + +**New to Data Science?** We've created a special [examples directory](examples/README.md) with simple, well-commented code to help you get started: + +- 🌟 **Hello World** - Your first data science program +- 📂 **Loading Data** - Learn to read and explore datasets +- 📊 **Simple Analysis** - Calculate statistics and find patterns +- 📈 **Basic Visualization** - Create charts and graphs +- 🔬 **Real-World Project** - Complete workflow from start to finish + +Each example includes detailed comments explaining every step, making it perfect for absolute beginners! 
+ +👉 **[Start with the examples](examples/README.md)** 👈 + ## Lessons diff --git a/examples/01_hello_world_data_science.py b/examples/01_hello_world_data_science.py new file mode 100644 index 00000000..d44b1521 --- /dev/null +++ b/examples/01_hello_world_data_science.py @@ -0,0 +1,87 @@ +""" +Hello World - Data Science Style! + +This is your very first data science program. It introduces you to the basic +concepts of working with data in Python. + +What you'll learn: +- How to create a simple dataset +- How to display data +- How to work with Python lists and dictionaries +- Basic data manipulation + +Prerequisites: Just Python installed on your computer! +""" + +# Let's start with the classic "Hello, World!" but with a data science twist +print("=" * 50) +print("Hello, World of Data Science!") +print("=" * 50) +print() + +# In data science, we work with data. Let's create our first simple dataset. +# We'll use a list to store information about students and their test scores. + +# A list is a collection of items in Python, written with square brackets [] +students = ["Alice", "Bob", "Charlie", "Diana", "Eve"] +scores = [85, 92, 78, 95, 88] + +print("Our Dataset:") +print("-" * 50) +print("Students:", students) +print("Scores:", scores) +print() + +# Now let's do something useful with this data! 
+# We can find basic statistics about the scores
+
+# Find the highest score
+highest_score = max(scores)
+print(f"📊 Highest score: {highest_score}")
+
+# Find the lowest score
+lowest_score = min(scores)
+print(f"📊 Lowest score: {lowest_score}")
+
+# Calculate the average score
+# sum() adds all numbers together, len() tells us how many items we have
+average_score = sum(scores) / len(scores)
+print(f"📊 Average score: {average_score:.2f}")  # .2f means show 2 decimal places
+print()
+
+# Let's find who got the highest score
+# We use index() to find where the highest_score is in our list
+top_student_index = scores.index(highest_score)
+top_student = students[top_student_index]
+print(f"🏆 Top student: {top_student} with a score of {highest_score}")
+print()
+
+# Now let's organize this data in a more structured way
+# We'll use a dictionary - it pairs keys (student names) with values (scores)
+print("Student Scores (organized as key-value pairs):")
+print("-" * 50)
+
+# Create a dictionary by pairing students with their scores.
+# zip() walks both lists in step and hands us (student, score) pairs;
+# dict() then turns each pair into a key-value entry.
+student_scores = dict(zip(students, scores))
+
+# Display each student and their score
+for student, score in student_scores.items():
+    # Add a special marker for the top student
+    marker = "⭐" if student == top_student else " "
+    print(f"{marker} {student}: {score} points")
+
+print()
+print("=" * 50)
+print("Congratulations! You've completed your first data science program!")
+print("=" * 50)
+
+# What did we just do?
+# 1. Created a simple dataset (student names and scores)
+# 2. Performed basic analysis (max, min, average)
+# 3. Found insights (who is the top student)
+# 4. Organized the data in a useful structure (dictionary)
+#
+# These are the fundamental building blocks of data science!
+# Next, you'll learn to work with real datasets using powerful libraries.
diff --git a/examples/02_loading_data.py b/examples/02_loading_data.py new file mode 100644 index 00000000..cbaf518c --- /dev/null +++ b/examples/02_loading_data.py @@ -0,0 +1,128 @@ +""" +Loading and Exploring Data + +In real data science projects, you'll work with data stored in files. +This example shows you how to load data from a CSV file and explore it. + +What you'll learn: +- How to load data from a CSV file +- How to view basic information about your dataset +- How to display the first/last rows +- How to get summary statistics + +Prerequisites: pandas library (install with: pip install pandas) +""" + +# Import the pandas library - it's the most popular tool for working with data in Python +# We give it the short name 'pd' so we can type less +import pandas as pd + +print("=" * 70) +print("Welcome to Data Loading and Exploration!") +print("=" * 70) +print() + +# Step 1: Load data from a CSV file +# CSV stands for "Comma-Separated Values" - a common format for storing data +# We'll use the birds dataset that comes with this repository +print("📂 Loading data from birds.csv...") +print() + +# Load the data into a DataFrame (think of it as a smart spreadsheet) +# A DataFrame is pandas' main data structure - it organizes data in rows and columns +data = pd.read_csv('../data/birds.csv') + +print("✅ Data loaded successfully!") +print() + +# Step 2: Get basic information about the dataset +print("-" * 70) +print("BASIC DATASET INFORMATION") +print("-" * 70) + +# How many rows and columns do we have? +num_rows, num_columns = data.shape +print(f"📊 Dataset size: {num_rows} rows × {num_columns} columns") +print() + +# What are the column names? +print("📋 Column names:") +for i, column in enumerate(data.columns, 1): + print(f" {i}. 
{column}")
+print()
+
+# Step 3: Look at the first few rows of data
+# This gives us a quick preview of what the data looks like
+print("-" * 70)
+print("FIRST 5 ROWS OF DATA (Preview)")
+print("-" * 70)
+print(data.head())  # head() shows the first 5 rows by default
+print()
+
+# Step 4: Look at the last few rows
+print("-" * 70)
+print("LAST 3 ROWS OF DATA")
+print("-" * 70)
+print(data.tail(3))  # tail(3) shows the last 3 rows
+print()
+
+# Step 5: Get information about data types
+print("-" * 70)
+print("DATA TYPES AND NON-NULL COUNTS")
+print("-" * 70)
+data.info()  # Prints column names, data types, and non-null counts itself; it returns None, so don't wrap it in print()
+print()
+
+# Step 6: Get statistical summary
+print("-" * 70)
+print("STATISTICAL SUMMARY (for numerical columns)")
+print("-" * 70)
+# describe() gives us statistics like mean, std, min, max, etc.
+print(data.describe())
+print()
+
+# Step 7: Check for missing values
+print("-" * 70)
+print("MISSING VALUES CHECK")
+print("-" * 70)
+missing_values = data.isnull().sum()
+print("Number of missing values per column:")
+print(missing_values)
+print()
+
+if missing_values.sum() == 0:
+    print("✅ Great! No missing values found.")
+else:
+    print("⚠️ Some columns have missing values. 
You may need to handle them.") +print() + +# Step 8: Get unique values in a column +print("-" * 70) +print("SAMPLE: UNIQUE VALUES") +print("-" * 70) +# Let's see what unique values exist in the first column +first_column = data.columns[0] +unique_count = data[first_column].nunique() +print(f"The column '{first_column}' has {unique_count} unique value(s)") +print() + +# Summary +print("=" * 70) +print("SUMMARY") +print("=" * 70) +print("You've learned how to:") +print(" ✓ Load data from a CSV file using pandas") +print(" ✓ Check the size and shape of your dataset") +print(" ✓ View the first and last rows") +print(" ✓ Understand data types") +print(" ✓ Get statistical summaries") +print(" ✓ Check for missing values") +print() +print("Next step: Try loading other CSV files from the data/ folder!") +print("=" * 70) + +# Pro Tips: +# - Always explore your data before analyzing it +# - Check for missing values and understand why they might be missing +# - Look at the data types to ensure they make sense +# - Use head() and tail() to spot any obvious issues with your data diff --git a/examples/03_simple_analysis.py b/examples/03_simple_analysis.py new file mode 100644 index 00000000..23749d57 --- /dev/null +++ b/examples/03_simple_analysis.py @@ -0,0 +1,174 @@ +""" +Simple Data Analysis + +Learn how to analyze data and answer questions about it. +This example demonstrates common data analysis operations. 
+ +What you'll learn: +- How to calculate statistics on your data +- How to filter data based on conditions +- How to group and aggregate data +- How to sort data + +Prerequisites: pandas library (install with: pip install pandas) +""" + +import pandas as pd + +print("=" * 70) +print("Simple Data Analysis Tutorial") +print("=" * 70) +print() + +# Load a dataset - we'll use the honey production data +print("📂 Loading honey production data...") +data = pd.read_csv('../data/honey.csv') +print("✅ Data loaded!\n") + +# Quick look at the data +print("-" * 70) +print("FIRST FEW ROWS") +print("-" * 70) +print(data.head(3)) +print() + +# SECTION 1: Basic Statistics +print("=" * 70) +print("SECTION 1: CALCULATING STATISTICS") +print("=" * 70) +print() + +# Let's look at the 'totalprod' column (total production) +if 'totalprod' in data.columns: + total_production = data['totalprod'] + + print("Total Honey Production Statistics:") + print("-" * 70) + print(f" Mean (Average): {total_production.mean():,.2f}") + print(f" Median (Middle): {total_production.median():,.2f}") + print(f" Mode (Most common): {total_production.mode().values[0]:,.2f}") + print(f" Std Dev: {total_production.std():,.2f}") + print(f" Minimum: {total_production.min():,.2f}") + print(f" Maximum: {total_production.max():,.2f}") + print() + +# SECTION 2: Filtering Data +print("=" * 70) +print("SECTION 2: FILTERING DATA") +print("=" * 70) +print() + +# Let's filter the data to show only records from a specific year +if 'year' in data.columns: + year_to_filter = 2000 + filtered_data = data[data['year'] == year_to_filter] + + print(f"Showing data for year {year_to_filter}:") + print("-" * 70) + print(f"Found {len(filtered_data)} records") + print() + print(filtered_data.head()) + print() + +# Filter based on multiple conditions +if 'totalprod' in data.columns and 'year' in data.columns: + # Find records where production was above 10 million pounds after 2010 + high_production = data[(data['totalprod'] > 10000000) 
& (data['year'] > 2010)] + + print("High production years (>10M pounds after 2010):") + print("-" * 70) + print(f"Found {len(high_production)} records") + print() + +# SECTION 3: Grouping and Aggregating +print("=" * 70) +print("SECTION 3: GROUPING AND AGGREGATING DATA") +print("=" * 70) +print() + +# Group by state and calculate average production +if 'state' in data.columns and 'totalprod' in data.columns: + # Group the data by state and calculate mean production + state_averages = data.groupby('state')['totalprod'].mean() + + # Sort to see which states have highest average production + state_averages_sorted = state_averages.sort_values(ascending=False) + + print("Top 10 States by Average Honey Production:") + print("-" * 70) + for i, (state, avg_prod) in enumerate(state_averages_sorted.head(10).items(), 1): + print(f"{i:2d}. {state:20s} {avg_prod:,.0f} pounds") + print() + +# SECTION 4: Sorting Data +print("=" * 70) +print("SECTION 4: SORTING DATA") +print("=" * 70) +print() + +if 'totalprod' in data.columns: + # Sort by total production in descending order + sorted_data = data.sort_values('totalprod', ascending=False) + + print("Records with Highest Production:") + print("-" * 70) + # Show the top 5 records + columns_to_show = ['state', 'year', 'totalprod'] if all(col in data.columns for col in ['state', 'year', 'totalprod']) else data.columns[:3] + print(sorted_data[columns_to_show].head()) + print() + +# SECTION 5: Counting Values +print("=" * 70) +print("SECTION 5: COUNTING VALUES") +print("=" * 70) +print() + +if 'state' in data.columns: + # Count how many records we have for each state + state_counts = data['state'].value_counts() + + print("Number of records per state (top 10):") + print("-" * 70) + for state, count in state_counts.head(10).items(): + print(f"{state:20s} {count:3d} records") + print() + +# SECTION 6: Answering a Question +print("=" * 70) +print("SECTION 6: ANSWERING A REAL QUESTION") +print("=" * 70) +print() + +# Question: Which state 
had the highest honey production in 2012? +if all(col in data.columns for col in ['state', 'year', 'totalprod']): + year_2012 = data[data['year'] == 2012] + + if len(year_2012) > 0: + # Find the row with maximum production in 2012 + max_prod_idx = year_2012['totalprod'].idxmax() + max_prod_state = year_2012.loc[max_prod_idx, 'state'] + max_prod_amount = year_2012.loc[max_prod_idx, 'totalprod'] + + print("Question: Which state had the highest honey production in 2012?") + print("-" * 70) + print(f"Answer: {max_prod_state}") + print(f"Production: {max_prod_amount:,.0f} pounds") + print() + +# Summary +print("=" * 70) +print("CONGRATULATIONS!") +print("=" * 70) +print("You've learned how to:") +print(" ✓ Calculate basic statistics (mean, median, mode, etc.)") +print(" ✓ Filter data based on conditions") +print(" ✓ Group data and calculate aggregates") +print(" ✓ Sort data to find top/bottom values") +print(" ✓ Count occurrences of values") +print(" ✓ Answer real questions using data") +print() +print("Try this yourself:") +print(" • Find the state with the lowest average production") +print(" • Calculate total production by year") +print(" • Find trends over time") +print("=" * 70) diff --git a/examples/04_basic_visualization.py b/examples/04_basic_visualization.py new file mode 100644 index 00000000..e37efd55 --- /dev/null +++ b/examples/04_basic_visualization.py @@ -0,0 +1,210 @@ +""" +Basic Data Visualization + +Learn how to create simple, effective visualizations to communicate your findings. +Visualizations help you and others understand data at a glance. 
+ +What you'll learn: +- How to create bar charts +- How to create line plots +- How to create pie charts +- How to customize and save your visualizations + +Prerequisites: +- pandas library (install with: pip install pandas) +- matplotlib library (install with: pip install matplotlib) +""" + +import pandas as pd +import matplotlib.pyplot as plt + +print("=" * 70) +print("Basic Data Visualization Tutorial") +print("=" * 70) +print() + +# Load data +print("📂 Loading honey production data...") +data = pd.read_csv('../data/honey.csv') +print("✅ Data loaded!\n") + +# For better display, we'll use a subset of the data +# Let's focus on a few states in recent years +if 'state' in data.columns and 'year' in data.columns: + # Get data for a few states in recent years + states_to_show = ['CA', 'FL', 'ND', 'SD', 'MT'] + recent_data = data[(data['year'] >= 2010) & (data['state'].isin(states_to_show))] + +# VISUALIZATION 1: Bar Chart +print("=" * 70) +print("VISUALIZATION 1: BAR CHART") +print("=" * 70) +print() + +if 'state' in data.columns and 'totalprod' in data.columns: + # Calculate average production by state (for top 10 states) + state_avg = data.groupby('state')['totalprod'].mean().sort_values(ascending=False).head(10) + + print("Creating a bar chart of average honey production by state...") + print() + + # Create the figure and axis + plt.figure(figsize=(12, 6)) # Width: 12 inches, Height: 6 inches + + # Create the bar chart + plt.bar(state_avg.index, state_avg.values, color='gold', edgecolor='orange') + + # Add labels and title + plt.xlabel('State', fontsize=12) + plt.ylabel('Average Production (pounds)', fontsize=12) + plt.title('Top 10 States by Average Honey Production', fontsize=14, fontweight='bold') + + # Rotate x-axis labels for better readability + plt.xticks(rotation=45) + + # Add a grid for easier reading (behind the bars) + plt.grid(axis='y', alpha=0.3, linestyle='--') + + # Adjust layout to prevent label cutoff + plt.tight_layout() + + # Save the figure + 
plt.savefig('bar_chart_example.png', dpi=300, bbox_inches='tight') + print("✅ Bar chart saved as 'bar_chart_example.png'") + plt.close() # Close to free memory + print() + +# VISUALIZATION 2: Line Plot +print("=" * 70) +print("VISUALIZATION 2: LINE PLOT") +print("=" * 70) +print() + +if 'year' in data.columns and 'totalprod' in data.columns: + # Calculate total production by year + yearly_production = data.groupby('year')['totalprod'].sum() + + print("Creating a line plot of honey production over time...") + print() + + plt.figure(figsize=(12, 6)) + + # Create the line plot + plt.plot(yearly_production.index, yearly_production.values, + marker='o', # Add circular markers at each data point + linewidth=2, # Line thickness + color='darkorange', # Line color + markersize=6, # Size of markers + markerfacecolor='gold') # Fill color of markers + + # Add labels and title + plt.xlabel('Year', fontsize=12) + plt.ylabel('Total Production (pounds)', fontsize=12) + plt.title('Honey Production Over Time', fontsize=14, fontweight='bold') + + # Add a grid + plt.grid(True, alpha=0.3, linestyle='--') + + plt.tight_layout() + plt.savefig('line_plot_example.png', dpi=300, bbox_inches='tight') + print("✅ Line plot saved as 'line_plot_example.png'") + plt.close() + print() + +# VISUALIZATION 3: Pie Chart +print("=" * 70) +print("VISUALIZATION 3: PIE CHART") +print("=" * 70) +print() + +if 'state' in data.columns and 'totalprod' in data.columns: + # Get total production for top 5 states + top5_states = data.groupby('state')['totalprod'].sum().sort_values(ascending=False).head(5) + + print("Creating a pie chart of production share (top 5 states)...") + print() + + plt.figure(figsize=(10, 8)) + + # Create the pie chart + colors = ['gold', 'orange', 'lightsalmon', 'lightcoral', 'peachpuff'] + plt.pie(top5_states.values, + labels=top5_states.index, # State names + autopct='%1.1f%%', # Show percentages + startangle=90, # Start from top + colors=colors, + explode=(0.1, 0, 0, 0, 0)) # Slightly 
separate the first slice + + plt.title('Top 5 States Share of Total Honey Production', + fontsize=14, fontweight='bold', pad=20) + + plt.savefig('pie_chart_example.png', dpi=300, bbox_inches='tight') + print("✅ Pie chart saved as 'pie_chart_example.png'") + plt.close() + print() + +# VISUALIZATION 4: Multiple Lines on One Plot +print("=" * 70) +print("VISUALIZATION 4: COMPARING MULTIPLE SERIES") +print("=" * 70) +print() + +if 'year' in data.columns and 'totalprod' in data.columns and 'state' in data.columns: + # Compare production trends for a few states + states_to_compare = ['CA', 'ND', 'SD'] + + print(f"Creating a comparison plot for states: {', '.join(states_to_compare)}...") + print() + + plt.figure(figsize=(12, 6)) + + # Plot a line for each state + colors_map = {'CA': 'blue', 'ND': 'green', 'SD': 'red'} + + for state in states_to_compare: + if state in data['state'].values: + state_data = data[data['state'] == state].groupby('year')['totalprod'].sum() + plt.plot(state_data.index, state_data.values, + marker='o', + label=state, # This will appear in the legend + linewidth=2, + color=colors_map.get(state, 'gray')) + + plt.xlabel('Year', fontsize=12) + plt.ylabel('Total Production (pounds)', fontsize=12) + plt.title('Honey Production Comparison by State', fontsize=14, fontweight='bold') + plt.legend(title='State') # Add a legend + plt.grid(True, alpha=0.3, linestyle='--') + + plt.tight_layout() + plt.savefig('comparison_plot_example.png', dpi=300, bbox_inches='tight') + print("✅ Comparison plot saved as 'comparison_plot_example.png'") + plt.close() + print() + +# Summary +print("=" * 70) +print("CONGRATULATIONS!") +print("=" * 70) +print("You've learned how to:") +print(" ✓ Create bar charts to compare categories") +print(" ✓ Create line plots to show trends over time") +print(" ✓ Create pie charts to show proportions") +print(" ✓ Plot multiple data series on one chart") +print(" ✓ Customize colors, labels, and titles") +print(" ✓ Save your visualizations as 
image files") +print() +print("Your visualizations have been saved in the examples/ folder!") +print() +print("Try this yourself:") +print(" • Change the colors of your charts") +print(" • Add more states to the comparison plot") +print(" • Create a horizontal bar chart") +print(" • Experiment with different chart styles") +print() +print("Pro tip: Always choose the right chart type for your data:") +print(" • Bar charts: Compare categories") +print(" • Line plots: Show trends over time") +print(" • Pie charts: Show parts of a whole") +print(" • Scatter plots: Show relationships between variables") +print("=" * 70) diff --git a/examples/05_real_world_example.py b/examples/05_real_world_example.py new file mode 100644 index 00000000..5035369a --- /dev/null +++ b/examples/05_real_world_example.py @@ -0,0 +1,252 @@ +""" +Real World Example: Complete Data Science Workflow + +This example ties everything together, showing you a complete data science project +from start to finish. We'll analyze bird strike data to answer real questions. + +What you'll learn: +- How to approach a data science problem +- Complete workflow: Load → Clean → Analyze → Visualize → Conclude +- How to handle real-world data issues +- How to draw meaningful conclusions + +Prerequisites: +- pandas library (install with: pip install pandas) +- matplotlib library (install with: pip install matplotlib) + +Real-world context: +Bird strikes (when birds collide with aircraft) are a safety concern for aviation. +Let's analyze bird strike data to understand patterns and risks. +""" + +import pandas as pd +import matplotlib.pyplot as plt + +print("=" * 80) +print("REAL WORLD DATA SCIENCE PROJECT: BIRD STRIKE ANALYSIS") +print("=" * 80) +print() + +# STEP 1: DEFINE THE PROBLEM +print("=" * 80) +print("STEP 1: DEFINE THE PROBLEM") +print("=" * 80) +print() +print("Questions we want to answer:") +print(" 1. How many bird strikes occur?") +print(" 2. When do bird strikes most commonly occur?") +print(" 3. 
What are the most common bird species involved?")
+print(" 4. What is the typical damage level?")
+print()
+input("Press Enter to continue...")
+print()
+
+# STEP 2: LOAD THE DATA
+print("=" * 80)
+print("STEP 2: LOAD THE DATA")
+print("=" * 80)
+print()
+print("📂 Loading bird strike data...")
+
+try:
+    data = pd.read_csv('../data/birds.csv')
+    print(f"✅ Successfully loaded {len(data)} records")
+    print()
+except FileNotFoundError:
+    print("❌ Error: birds.csv not found in data/ folder")
+    print("Please make sure the data file exists.")
+    raise SystemExit(1)  # Stop with exit code 1; the exit() helper is only meant for the interactive shell
+
+# STEP 3: EXPLORE THE DATA
+print("=" * 80)
+print("STEP 3: EXPLORE THE DATA")
+print("=" * 80)
+print()
+
+print("Dataset Information:")
+print("-" * 80)
+print(f" • Shape: {data.shape[0]} rows × {data.shape[1]} columns")
+print(f" • Columns: {', '.join(data.columns.tolist())}")
+print()
+
+print("First few rows:")
+print("-" * 80)
+print(data.head(3))
+print()
+
+print("Data types:")
+print("-" * 80)
+print(data.dtypes)
+print()
+
+# STEP 4: CLEAN THE DATA
+print("=" * 80)
+print("STEP 4: CLEAN THE DATA")
+print("=" * 80)
+print()
+
+# Check for missing values
+print("Checking for missing values...")
+missing_counts = data.isnull().sum()
+missing_percentage = (missing_counts / len(data)) * 100
+
+print("-" * 80)
+for column in data.columns:
+    if missing_counts[column] > 0:
+        print(f" • {column}: {missing_counts[column]} missing ({missing_percentage[column]:.1f}%)")
+
+if missing_counts.sum() == 0:
+    print(" ✅ No missing values found!")
+print()
+
+# Handle missing values in a specific column if needed
+# For this example, we'll work with the data as-is, but in real projects
+# you might need to fill or remove missing values
+
+print("Data cleaning notes:")
+print("-" * 80)
+print(" • In a real project, you would:")
+print(" - Decide how to handle missing values (remove, fill, or keep)")
+print(" - Check for duplicate records")
+print(" - Validate data types")
+print(" - Look for outliers or incorrect values")
+print(" 
• For this example, we'll proceed with the data as-is") +print() + +# STEP 5: ANALYZE THE DATA +print("=" * 80) +print("STEP 5: ANALYZE THE DATA") +print("=" * 80) +print() + +# Analysis 1: Total number of incidents +total_strikes = len(data) +print("Analysis 1: Overview") +print("-" * 80) +print(f" • Total bird strikes recorded: {total_strikes:,}") +print() + +# Analysis 2: Find the most common bird species +if 'Bird Species' in data.columns: + print("Analysis 2: Most Common Bird Species") + print("-" * 80) + top_species = data['Bird Species'].value_counts().head(5) + for i, (species, count) in enumerate(top_species.items(), 1): + print(f" {i}. {species}: {count} strikes ({count/total_strikes*100:.1f}%)") + print() + +# Analysis 3: Analyze by time (if time column exists) +time_column = None +for col in ['FlightDate', 'Date', 'Time']: + if col in data.columns: + time_column = col + break + +if time_column: + print(f"Analysis 3: Temporal Analysis") + print("-" * 80) + print(f" • Using column: {time_column}") + # Additional time analysis could go here + print() + +# STEP 6: VISUALIZE THE DATA +print("=" * 80) +print("STEP 6: VISUALIZE THE DATA") +print("=" * 80) +print() + +# Visualization 1: Top bird species +if 'Bird Species' in data.columns: + print("Creating visualization 1: Top 10 Bird Species...") + top_10_species = data['Bird Species'].value_counts().head(10) + + plt.figure(figsize=(12, 6)) + plt.barh(range(len(top_10_species)), top_10_species.values, color='steelblue') + plt.yticks(range(len(top_10_species)), top_10_species.index) + plt.xlabel('Number of Strikes', fontsize=12) + plt.ylabel('Bird Species', fontsize=12) + plt.title('Top 10 Bird Species Involved in Strikes', fontsize=14, fontweight='bold') + plt.grid(axis='x', alpha=0.3) + plt.tight_layout() + plt.savefig('birds_top_species.png', dpi=300, bbox_inches='tight') + plt.close() + print(" ✅ Saved as 'birds_top_species.png'") + print() + +# Visualization 2: Distribution of another variable +# Check 
what columns are available for interesting visualizations
+numeric_columns = data.select_dtypes(include='number').columns  # 'number' matches every numeric dtype, not just int64/float64
+if len(numeric_columns) > 0:
+    print("Creating visualization 2: Numeric data distribution...")
+    col_to_plot = numeric_columns[0]
+
+    plt.figure(figsize=(10, 6))
+    data[col_to_plot].hist(bins=30, color='teal', edgecolor='black', alpha=0.7)
+    plt.xlabel(col_to_plot, fontsize=12)
+    plt.ylabel('Frequency', fontsize=12)
+    plt.title(f'Distribution of {col_to_plot}', fontsize=14, fontweight='bold')
+    plt.grid(axis='y', alpha=0.3)
+    plt.tight_layout()
+    plt.savefig('birds_distribution.png', dpi=300, bbox_inches='tight')
+    plt.close()
+    print(" ✅ Saved as 'birds_distribution.png'")
+    print()
+
+# STEP 7: DRAW CONCLUSIONS
+print("=" * 80)
+print("STEP 7: DRAW CONCLUSIONS")
+print("=" * 80)
+print()
+
+print("Key Findings:")
+print("-" * 80)
+print(f" 1. We analyzed {total_strikes:,} bird strike incidents")
+
+if 'Bird Species' in data.columns:
+    species_counts = data['Bird Species'].value_counts()  # count once; sorted most-common first
+    most_common_species, most_common_count = species_counts.index[0], species_counts.iloc[0]
+    print(f" 2. 
Most common species: {most_common_species} ({most_common_count} incidents)") + +print() +print("Implications:") +print("-" * 80) +print(" • This data can help airports implement targeted bird control measures") +print(" • Understanding patterns helps improve aircraft safety procedures") +print(" • Airlines can use this data for pilot training and awareness") +print() + +# STEP 8: NEXT STEPS +print("=" * 80) +print("STEP 8: NEXT STEPS & RECOMMENDATIONS") +print("=" * 80) +print() +print("What you could do next:") +print(" • Analyze temporal patterns (time of day, season)") +print(" • Investigate geographical patterns (airports, regions)") +print(" • Correlate with aircraft types or flight phases") +print(" • Build a predictive model for high-risk conditions") +print(" • Create an interactive dashboard for stakeholders") +print() + +# FINAL SUMMARY +print("=" * 80) +print("CONGRATULATIONS! YOU'VE COMPLETED A REAL DATA SCIENCE PROJECT!") +print("=" * 80) +print() +print("You practiced the complete data science workflow:") +print(" ✓ Step 1: Defined clear questions to answer") +print(" ✓ Step 2: Loaded real-world data") +print(" ✓ Step 3: Explored the data structure") +print(" ✓ Step 4: Cleaned and prepared the data") +print(" ✓ Step 5: Analyzed the data to find patterns") +print(" ✓ Step 6: Visualized findings") +print(" ✓ Step 7: Drew meaningful conclusions") +print(" ✓ Step 8: Identified next steps") +print() +print("This is the same process data scientists use in real projects!") +print() +print("Your next challenge:") +print(" • Try this workflow with other datasets in the data/ folder") +print(" • Come up with your own questions and find answers") +print(" • Share your findings with others") +print("=" * 80) diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 00000000..44455523 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,135 @@ +# Beginner-Friendly Data Science Examples + +Welcome to the examples directory! 
This collection of simple, well-commented examples is designed to help you get started with data science, even if you're a complete beginner.
+
+## 📚 What You'll Find Here
+
+Each example is self-contained and includes:
+- **Clear comments** explaining every step
+- **Simple, readable code** that demonstrates one concept at a time
+- **Real-world context** to help you understand when and why to use these techniques
+- **Expected output** so you know what to look for
+
+## 🚀 Getting Started
+
+### Prerequisites
+Before running these examples, make sure you have:
+- Python 3.7 or higher installed
+- Basic understanding of how to run Python scripts (run them from inside the `examples/` folder, because they load their data from `../data/`)
+
+### Installing Required Libraries
+```bash
+pip install pandas numpy matplotlib
+```
+
+## 📖 Examples Overview
+
+### 1. Hello World - Data Science Style
+**File:** `01_hello_world_data_science.py`
+
+Your first data science program! Learn how to:
+- Create a simple dataset with Python lists
+- Calculate basic statistics (highest, lowest, and average score)
+- Organize your data as key-value pairs in a dictionary
+
+Perfect for absolute beginners who want to see their first data science program in action.
+
+---
+
+### 2. Loading and Exploring Data
+**File:** `02_loading_data.py`
+
+Learn the fundamentals of working with data:
+- Read data from CSV files
+- View the first few rows of your dataset
+- Get basic statistics about your data
+- Understand data types
+
+This is often the first step in any data science project!
+
+---
+
+### 3. Simple Data Analysis
+**File:** `03_simple_analysis.py`
+
+Perform your first data analysis:
+- Calculate basic statistics (mean, median, mode)
+- Find maximum and minimum values
+- Count occurrences of values
+- Filter data based on conditions
+
+See how to answer simple questions about your data.
+
+---
+
+### 4. 
Data Visualization Basics +**File:** `04_basic_visualization.py` + +Create your first visualizations: +- Make a simple bar chart +- Create a line plot +- Generate a pie chart +- Save your visualizations as images + +Learn to communicate your findings visually! + +--- + +### 5. Working with Real Data +**File:** `05_real_world_example.py` + +Put it all together with a complete example: +- Load real data from the repository +- Clean and prepare the data +- Perform analysis +- Create meaningful visualizations +- Draw conclusions + +This example shows you a complete workflow from start to finish. + +--- + +## 🎯 How to Use These Examples + +1. **Start from the beginning**: The examples are numbered in order of difficulty. Begin with `01_hello_world_data_science.py` and work your way through. + +2. **Read the comments**: Each file has detailed comments explaining what the code does and why. Read them carefully! + +3. **Experiment**: Try modifying the code. What happens if you change a value? Break things and fix them - that's how you learn! + +4. **Run the code**: Execute each example and observe the output. Compare it with what you expected. + +5. **Build on it**: Once you understand an example, try extending it with your own ideas. + +## 💡 Tips for Beginners + +- **Don't rush**: Take time to understand each example before moving to the next one +- **Type the code yourself**: Don't just copy-paste. 
Typing helps you learn and remember +- **Look up unfamiliar concepts**: If you see something you don't understand, search for it online or in the main lessons +- **Ask questions**: Join the [discussion forum](https://github.com/microsoft/Data-Science-For-Beginners/discussions) if you need help +- **Practice regularly**: Try to code a little bit every day rather than in long sessions once a week + +## 🔗 Next Steps + +After completing these examples, you're ready to: +- Work through the main curriculum lessons +- Try the assignments in each lesson folder +- Explore the Jupyter notebooks for more in-depth learning +- Create your own data science projects + +## 📚 Additional Resources + +- [Main Curriculum](../README.md) - The complete 20-lesson course +- [For Teachers](../for-teachers.md) - Using this curriculum in your classroom +- [Microsoft Learn](https://learn.microsoft.com/training/) - Free online learning resources +- [Python Documentation](https://docs.python.org/3/) - Official Python reference + +## 🤝 Contributing + +Found a bug or have an idea for a new example? We welcome contributions! Please see our [Contributing Guide](../CONTRIBUTING.md). + +--- + +**Happy Learning! 🎉** + +Remember: Every expert was once a beginner. Take it one step at a time, and don't be afraid to make mistakes - they're part of the learning process!