Add lifestyle data analysis script

2 months ago · 3821bbde5a
parent 59763a1a89
commit 3821bbde5a
1 changed files with 112 additions and 0 deletions
--- a/112
+++ b/112
@ -0,0 +1,112 @@
+"Based on the correlation matrix, what is the strength and direction of the linear relationship between sleep duration and mood score in this dataset?"
+# 1. Import necessary libraries
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Set visualization style
+sns.set_style('darkgrid')
+plt.rcParams['figure.figsize'] = (14, 7)
+
+# --- 2. Data Loading ---
+# NOTE: Replace 'lifestyle_data.csv' with your actual file path.
+# Assuming a dataset with columns: Date, Steps, CaloriesBurned, Distance, SleepDuration (hours)
+try:
+    df = pd.read_csv('lifestyle_data.csv')
+    print("Life Style data successfully loaded!")
+except FileNotFoundError:
+    print("Error: Make sure 'lifestyle_data.csv' is in the correct directory.")
+    print("Creating a dummy DataFrame for demonstration.")
+    # Create a minimal dummy DataFrame for structural demonstration if loading fails
+    data = {
+        'Date': pd.to_datetime(pd.date_range(start='2024-01-01', periods=30, freq='D')),
+        'Steps': np.random.randint(3000, 15000, 30),
+        'CaloriesBurned': np.random.randint(500, 2000, 30),
+        'Distance': np.round(np.random.uniform(2.0, 10.0, 30), 2),
+        'SleepDuration': np.round(np.random.uniform(5.5, 9.0, 30), 1),
+        'MoodScore': np.random.randint(1, 11, 30) # 1=Bad, 10=Excellent
+    }
+    df = pd.DataFrame(data)
+
+# Initial Data Exploration
+print("\n--- Initial Data Info ---")
+print(df.head())
+print(df.info())
+
+
+# --- 3. Data Cleaning and Preprocessing ---
+
+# 3.1. Convert 'Date' column to datetime objects
+if 'Date' in df.columns and df['Date'].dtype != '<M8[ns]': # Check if it's already datetime
+    df['Date'] = pd.to_datetime(df['Date'])
+
+# 3.2. Set 'Date' as index for time-series analysis
+df.set_index('Date', inplace=True)
+
+# 3.3. Check for Outliers (Simple check on a key metric)
+print(f"\nSteps - Basic Statistics:\n{df['Steps'].describe()}")
+# You might apply Z-score or IQR methods here for formal outlier removal
+
+
+# --- 4. Exploratory Data Analysis (EDA) & Insights ---
+
+# 4.1. Overall Trends Over Time
+print("\n--- 4.1 Weekly Averages ---")
+weekly_summary = df[['Steps', 'SleepDuration']].resample('W').mean()
+print(weekly_summary.head())
+
+# Time Series Plot for Steps and Sleep (Using Secondary Y-axis)
+fig, ax1 = plt.subplots(figsize=(14, 7))
+
+# Plot Steps on Primary Axis
+color = 'tab:blue'
+ax1.set_xlabel('Date')
+ax1.set_ylabel('Weekly Average Steps', color=color)
+ax1.plot(weekly_summary.index, weekly_summary['Steps'], color=color, marker='o')
+ax1.tick_params(axis='y', labelcolor=color)
+
+# Create a secondary axis for Sleep
+ax2 = ax1.twinx()
+color = 'tab:red'
+ax2.set_ylabel('Weekly Average Sleep Duration (hours)', color=color)
+ax2.plot(weekly_summary.index, weekly_summary['SleepDuration'], color=color, marker='x')
+ax2.tick_params(axis='y', labelcolor=color)
+
+plt.title('Weekly Trends: Steps vs. Sleep Duration ')
+fig.tight_layout()
+plt.show()
+
+# 4.2. Relationship Analysis: Steps vs. Calories Burned
+correlation_steps_calories = df['Steps'].corr(df['CaloriesBurned'])
+print(f"\nCorrelation between Steps and Calories Burned: {correlation_steps_calories:.2f}")
+
+plt.figure(figsize=(8, 6))
+sns.scatterplot(x='Steps', y='CaloriesBurned', data=df)
+plt.title('Relationship between Daily Steps and Calories Burned')
+plt.show()
+
+# 4.3. Correlation Matrix (Identifying Key Relationships)
+# Select the numeric columns for correlation analysis
+numeric_df = df[['Steps', 'CaloriesBurned', 'Distance', 'SleepDuration', 'MoodScore']]
+correlation_matrix = numeric_df.corr()
+
+plt.figure(figsize=(10, 8))
+sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
+plt.title('Correlation Matrix of Life Style Metrics ')
+plt.show()
+
+# 4.4. Day of the Week Analysis (When are we most active?)
+df['DayOfWeek'] = df.index.day_name()
+day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
+daily_activity = df.groupby('DayOfWeek')['Steps'].mean().reindex(day_order)
+
+print("\n--- 4.4 Average Steps by Day of the Week ---")
+print(daily_activity)
+
+plt.figure(figsize=(10, 6))
+sns.barplot(x=daily_activity.index, y=daily_activity.values, palette='viridis')
+plt.title('Average Daily Steps by Day of the Week')
+plt.ylabel('Average Steps')
+plt.xlabel('Day of Week')
+plt.show()