4-analyze_health

This page shows the source code for 4-analyze_health.py in browser-friendly HTML format. It was generated automatically from the original Python file.

Source File 4-analyze_health.py
Folder Chapter-4-Datasets-Medium
"""
Health & Fitness Analysis
Demonstrates: regression analysis, advanced correlations, health metrics visualization
"""

import sys
# Switch both standard streams to UTF-8 output. The hasattr guard skips any
# stream object that does not offer a reconfigure() method.
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_csv('dataset4_health.csv')

# Calculate additional metrics
# Net daily energy: a positive value means more calories consumed than burned.
df['Calorie_Balance'] = df['Calories_Consumed'] - df['Calories_Burned']
# Bucket daily exercise minutes into four ordered levels.
# include_lowest=True keeps rows with exactly 0 minutes in 'Low', and the
# open-ended top edge keeps everything above 60 minutes in 'Very High'.
# With the previous (0, 30] ... (60, 100] bins those rows silently became NaN
# and vanished from every per-category summary further down.
df['Exercise_Category'] = pd.cut(
    df['Exercise_Minutes_Daily'],
    bins=[0, 30, 45, 60, np.inf],
    labels=['Low', 'Moderate', 'High', 'Very High'],
    include_lowest=True,
)

print("=" * 60)
print("HEALTH & FITNESS ANALYSIS")
print("=" * 60)
print("\nDataset Overview:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print("\nBasic Statistics:")
print(df.describe())

# Create comprehensive visualization
# A single 18x14 inch figure; each panel below claims one cell of a 4x3 grid.
fig = plt.figure(figsize=(18, 14))

# 1. Comprehensive Correlation Heatmap
plt.subplot(4, 3, 1)
health_metrics = [
    'Age',
    'BMI',
    'Exercise_Minutes_Daily',
    'Steps_Daily',
    'Sleep_Hours',
    'Water_Intake_Liters',
    'Heart_Rate_Avg',
    'Blood_Pressure_Sys',
    'Stress_Level',
]
# Pairwise correlations between the selected columns.
corr_matrix = df[health_metrics].corr()
heatmap_options = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'RdYlGn_r',
    'center': 0,
    'square': True,
    'linewidths': 1,
    'cbar_kws': {'label': 'Correlation'},
}
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Health Metrics Correlation Matrix', fontsize=12, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

# 2. Exercise vs Calories Burned
plt.subplot(4, 3, 2)
exercise_minutes = df['Exercise_Minutes_Daily']
plt.scatter(
    exercise_minutes,
    df['Calories_Burned'],
    c=df['Age'],
    cmap='viridis',
    alpha=0.6,
    s=100,
)
plt.colorbar(label='Age')
# Linear regression
# scikit-learn expects a 2-D feature matrix, hence the (-1, 1) reshape.
X = exercise_minutes.values.reshape(-1, 1)
y = df['Calories_Burned'].values
model = LinearRegression()
model.fit(X, y)
# Evaluate the fitted line on the sorted x values so it is drawn left to right.
sorted_minutes = exercise_minutes.sort_values()
fitted_calories = model.predict(sorted_minutes.values.reshape(-1, 1))
r_squared = model.score(X, y)
plt.plot(sorted_minutes, fitted_calories, 'r--', linewidth=2,
         label=f'R² = {r_squared:.3f}')
plt.xlabel('Exercise Minutes Daily')
plt.ylabel('Calories Burned')
plt.title('Exercise vs Calories Burned', fontsize=12, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)

# 3. BMI Distribution
plt.subplot(4, 3, 3)
# Each category is tied to its colour by name so the pairing cannot drift.
bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obese']
bmi_palette = dict(zip(bmi_labels, ['#74B9FF', '#55EFC4', '#FDCB6E', '#FF7675']))
# Left-closed bins: [0, 18.5), [18.5, 25), [25, 30), [30, inf). A BMI of
# exactly 25 or 30 therefore falls into the higher category, matching the
# "Overweight threshold" line drawn at 25 in the Age-vs-BMI panel, and the
# open top edge keeps BMI values above 40 instead of turning them into NaN.
bmi_categories = pd.cut(df['BMI'], bins=[0, 18.5, 25, 30, np.inf],
                        labels=bmi_labels, right=False)
# value_counts() orders by frequency, which previously scrambled the fixed
# colour list relative to the labels; reindex restores category order.
bmi_counts = bmi_categories.value_counts().reindex(bmi_labels, fill_value=0)
# Drop empty categories so the pie has no zero-width wedges with stacked labels.
bmi_counts = bmi_counts[bmi_counts > 0]
colors_bmi = [bmi_palette[label] for label in bmi_counts.index]
plt.pie(bmi_counts.values, labels=bmi_counts.index, autopct='%1.1f%%',
        colors=colors_bmi, startangle=90)
plt.title('BMI Distribution', fontsize=12, fontweight='bold')

# 4. Steps vs Heart Rate
plt.subplot(4, 3, 4)
# One scatter series per distinct Gender value, in order of first appearance,
# so each gets its own colour and legend entry.
for gender_label in df['Gender'].unique():
    gender_rows = df.loc[df['Gender'] == gender_label]
    plt.scatter(
        gender_rows['Steps_Daily'],
        gender_rows['Heart_Rate_Avg'],
        label=gender_label,
        alpha=0.6,
        s=80,
    )
plt.xlabel('Steps Daily')
plt.ylabel('Average Heart Rate')
plt.title('Daily Steps vs Heart Rate by Gender', fontsize=12, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)

# 5. Sleep Hours Impact on Stress
plt.subplot(4, 3, 5)
# Mean stress level for every distinct Sleep_Hours value.
sleep_stress = df.groupby('Sleep_Hours')['Stress_Level'].mean()
sleep_hours = sleep_stress.index
mean_stress = sleep_stress.values
stress_color = '#6C5CE7'
plt.plot(sleep_hours, mean_stress, marker='o', linewidth=2, markersize=10,
         color=stress_color)
# Shade the area under the line in the same colour.
plt.fill_between(sleep_hours, mean_stress, alpha=0.3, color=stress_color)
plt.xlabel('Sleep Hours')
plt.ylabel('Average Stress Level')
plt.title('Sleep Duration vs Stress Level', fontsize=12, fontweight='bold')
plt.grid(True, alpha=0.3)

# 6. Activity Type Comparison
plt.subplot(4, 3, 6)
activity_metrics = (
    df.groupby('Activity_Type')[['Calories_Burned', 'Steps_Daily']].mean()
)
x = np.arange(len(activity_metrics))
width = 0.35
half_width = width / 2
calories_color = '#FF6B6B'
steps_color = '#4ECDC4'
# The two metrics get separate y-axes: calories on the left axis, steps on a
# twin axis sharing the same x positions.
ax = plt.gca()
ax2 = ax.twinx()
bars1 = ax.bar(x - half_width, activity_metrics['Calories_Burned'], width,
               label='Calories Burned', color=calories_color)
bars2 = ax2.bar(x + half_width, activity_metrics['Steps_Daily'], width,
                label='Steps Daily', color=steps_color)
ax.set_xlabel('Activity Type')
ax.set_ylabel('Calories Burned', color=calories_color)
ax2.set_ylabel('Steps Daily', color=steps_color)
ax.set_xticks(x)
ax.set_xticklabels(activity_metrics.index, rotation=45, ha='right')
plt.title('Activity Type Performance', fontsize=12, fontweight='bold')
# Each axis carries its own legend, placed in opposite corners.
ax.legend(loc='upper left')
ax2.legend(loc='upper right')

# 7. Water Intake vs Performance
plt.subplot(4, 3, 7)
# The top bin is open-ended so intakes above 4 L still count as
# 'Excellent (>3L)' as the label promises, and include_lowest keeps a 0 L
# reading in the lowest bin instead of turning it into NaN.
water_groups = pd.cut(
    df['Water_Intake_Liters'],
    bins=[0, 2, 2.5, 3, np.inf],
    labels=['Low (<2L)', 'Moderate (2-2.5L)', 'Good (2.5-3L)', 'Excellent (>3L)'],
    include_lowest=True,
)
# observed=False is spelled out so all four hydration levels keep a slot on
# the axis (and stay aligned with the four colours) whatever pandas' default
# for categorical groupers is.
water_perf = df.groupby(water_groups, observed=False)['Calories_Burned'].mean()
colors_water = ['#FF6B6B', '#FFA07A', '#98D8C8', '#6BCB77']
plt.bar(range(len(water_perf)), water_perf.values, color=colors_water)
plt.xticks(range(len(water_perf)), water_perf.index, rotation=45, ha='right')
plt.ylabel('Average Calories Burned')
plt.title('Hydration Impact on Performance', fontsize=12, fontweight='bold')
plt.grid(axis='y', alpha=0.3)

# 8. Blood Pressure Analysis
plt.subplot(4, 3, 8)
# Exercise_Category is categorical. observed=False is stated explicitly so
# every exercise level keeps its position on the x-axis even when no row
# falls into it, independent of pandas' changing default for `observed`.
exercise_bp = (
    df.groupby('Exercise_Category', observed=False)
    [['Blood_Pressure_Sys', 'Blood_Pressure_Dia']]
    .mean()
)
x = np.arange(len(exercise_bp))
width = 0.35
# Systolic and diastolic means are drawn side by side around each tick.
plt.bar(x - width/2, exercise_bp['Blood_Pressure_Sys'], width,
        label='Systolic', color='#E74C3C')
plt.bar(x + width/2, exercise_bp['Blood_Pressure_Dia'], width,
        label='Diastolic', color='#3498DB')
plt.xlabel('Exercise Level')
plt.ylabel('Blood Pressure (mmHg)')
plt.title('Exercise Level vs Blood Pressure', fontsize=12, fontweight='bold')
plt.xticks(x, exercise_bp.index)
plt.legend()
plt.grid(axis='y', alpha=0.3)

# 9. Age vs BMI with Exercise Overlay
plt.subplot(4, 3, 9)
# Point colour encodes daily exercise minutes.
point_style = {'cmap': 'RdYlGn', 'alpha': 0.6, 's': 100}
scatter = plt.scatter(df['Age'], df['BMI'],
                      c=df['Exercise_Minutes_Daily'], **point_style)
plt.colorbar(scatter, label='Exercise Minutes')
plt.xlabel('Age')
plt.ylabel('BMI')
plt.title('Age vs BMI (colored by Exercise)', fontsize=12, fontweight='bold')
# Horizontal reference line at BMI 25.
plt.axhline(y=25, color='orange', linestyle='--', label='Overweight threshold')
plt.legend()
plt.grid(True, alpha=0.3)

# 10. Calorie Balance Distribution
plt.subplot(4, 3, 10)
calorie_balance = df['Calorie_Balance']
mean_balance = calorie_balance.mean()
plt.hist(calorie_balance, bins=30, color='#95E1D3', edgecolor='black', alpha=0.7)
# Two vertical markers: the break-even point and the sample mean.
marker_style = {'linestyle': '--', 'linewidth': 2}
plt.axvline(0, color='red', label='Balanced', **marker_style)
plt.axvline(mean_balance, color='blue', label=f'Mean: {mean_balance:.0f}',
            **marker_style)
plt.xlabel('Calorie Balance (Consumed - Burned)')
plt.ylabel('Frequency')
plt.title('Calorie Balance Distribution', fontsize=12, fontweight='bold')
plt.legend()

# 11. Multi-metric Performance by Gender
plt.subplot(4, 3, 11)
# Mean of four lifestyle metrics for each Gender value (one row per gender).
gender_metrics = df.groupby('Gender')[['Exercise_Minutes_Daily', 'Steps_Daily', 
                                       'Sleep_Hours', 'Water_Intake_Liters']].mean()
# Normalize for comparison: z-score every metric column so metrics on very
# different scales (steps vs. litres) can share one axis.
# NOTE(review): the scaler is fit on the per-gender means, not on the raw rows,
# so each column is standardized across the gender groups only. With exactly
# two groups that yields -1/+1 for every metric regardless of the size of the
# gap; confirm this is the intended comparison.
# `scaler` is reused by the stress-level panel that follows.
scaler = StandardScaler()
gender_metrics_norm = pd.DataFrame(
    scaler.fit_transform(gender_metrics),
    index=gender_metrics.index,
    columns=gender_metrics.columns
)
# Transpose so metrics sit on the x-axis and each gender becomes a bar series.
gender_metrics_norm.T.plot(kind='bar', ax=plt.gca(), width=0.7)
plt.ylabel('Normalized Score')
plt.title('Health Metrics Comparison by Gender', fontsize=12, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Gender')
plt.grid(axis='y', alpha=0.3)

# 12. Stress Level Analysis
plt.subplot(4, 3, 12)
stress_columns = ['Exercise_Minutes_Daily', 'Sleep_Hours', 'BMI']
# Mean of each factor for every distinct stress level.
stress_factors = df.groupby('Stress_Level')[stress_columns].mean()
# Refit the existing scaler on these group means so the three factors share a
# common z-score scale on the y-axis.
scaled_factors = scaler.fit_transform(stress_factors)
stress_factors_norm = pd.DataFrame(
    scaled_factors,
    index=stress_factors.index,
    columns=stress_factors.columns,
)
stress_factors_norm.plot(marker='o', linewidth=2, ax=plt.gca())
plt.xlabel('Stress Level (1-7)')
plt.ylabel('Normalized Score')
plt.title('Factors Contributing to Stress', fontsize=12, fontweight='bold')
plt.legend(title='Metric')
plt.grid(True, alpha=0.3)

# Tidy the panel spacing, write the dashboard to disk, then display it.
plt.tight_layout()
save_options = {'dpi': 300, 'bbox_inches': 'tight'}
plt.savefig('health_analysis_dashboard.png', **save_options)
print("\n[OK] Dashboard saved as 'health_analysis_dashboard.png'")
plt.show()

# Statistical Analysis
section_rule = "=" * 60
print("\n" + section_rule)
print("HEALTH INSIGHTS & CORRELATIONS")
print(section_rule)

# Key correlations
print("\n1. KEY CORRELATIONS WITH CALORIES BURNED:")
calorie_predictors = ['Exercise_Minutes_Daily', 'Steps_Daily', 'Sleep_Hours',
                      'Water_Intake_Liters']
# Correlate each predictor with Calories_Burned, strongest positive first.
calorie_corr = (
    df[calorie_predictors]
    .corrwith(df['Calories_Burned'])
    .sort_values(ascending=False)
)
for metric_name, coefficient in calorie_corr.items():
    print(f"   {metric_name}: {coefficient:.3f}")

# Exercise impact
print("\n2. EXERCISE LEVEL IMPACT:")
# Report mean BMI, stress and heart rate for each exercise level in turn.
for level in ('Low', 'Moderate', 'High', 'Very High'):
    level_rows = df[df['Exercise_Category'] == level]
    avg_bmi = level_rows['BMI'].mean()
    avg_stress = level_rows['Stress_Level'].mean()
    avg_heart_rate = level_rows['Heart_Rate_Avg'].mean()
    print(f"   {level}: Avg BMI {avg_bmi:.1f}, "
          f"Avg Stress {avg_stress:.1f}, "
          f"Avg Heart Rate {avg_heart_rate:.1f}")

# Gender comparison
print("\n3. GENDER HEALTH METRICS:")
gender_columns = ['BMI', 'Exercise_Minutes_Daily', 'Sleep_Hours', 'Stress_Level']
# Per-gender mean of each selected metric.
gender_stats = df.groupby('Gender')[gender_columns].mean()
print(gender_stats)

# Optimal ranges
print("\n4. OPTIMAL HEALTH PROFILE (Top 25% Performers):")
# Take a quarter of the rows (at least one) so the selection matches the
# "Top 25%" heading for any dataset size. The previous hard-coded
# nlargest(25, ...) was only a 25% sample when the file held exactly 100 rows.
top_count = max(1, len(df) // 4)
top_performers = df.nlargest(top_count, 'Calories_Burned')
print(f"   Exercise: {top_performers['Exercise_Minutes_Daily'].mean():.1f} min/day")
print(f"   Steps: {top_performers['Steps_Daily'].mean():.0f} steps/day")
print(f"   Sleep: {top_performers['Sleep_Hours'].mean():.1f} hours")
print(f"   Water: {top_performers['Water_Intake_Liters'].mean():.1f} liters")
print(f"   Average BMI: {top_performers['BMI'].mean():.1f}")
print(f"   Average Stress: {top_performers['Stress_Level'].mean():.1f}")

# Activity type effectiveness
print("\n5. MOST EFFECTIVE ACTIVITY TYPES:")
effectiveness_columns = ['Calories_Burned', 'Steps_Daily', 'Stress_Level']
# Mean of every listed column per activity type, rounded to one decimal.
activity_effectiveness = (
    df.groupby('Activity_Type')
    .agg(dict.fromkeys(effectiveness_columns, 'mean'))
    .round(1)
)
ranked_activities = activity_effectiveness.sort_values('Calories_Burned',
                                                       ascending=False)
print(ranked_activities)

# Stress analysis
print("\n6. LOW STRESS INDIVIDUALS (<3) CHARACTERISTICS:")
# Rows whose stress level is strictly below 3.
low_stress = df[df['Stress_Level'] < 3]
low_stress_exercise = low_stress['Exercise_Minutes_Daily']
print(f"   Average Exercise: {low_stress_exercise.mean():.1f} min")
print(f"   Average Sleep: {low_stress['Sleep_Hours'].mean():.1f} hours")
print(f"   Average BMI: {low_stress['BMI'].mean():.1f}")
# Share of the low-stress rows exercising more than 45 minutes a day.
high_activity_pct = (low_stress_exercise > 45).sum() / len(low_stress) * 100
print(f"   Percentage with High Activity: {high_activity_pct:.1f}%")

print("\n" + "=" * 60)