This page shows the source code for 2-analyze_students.py in browser-friendly HTML format. It was generated automatically from the original Python file.
"""
Student Performance Analysis
Demonstrates: correlation analysis, statistical testing, seaborn advanced plots
"""
# import: Imports a module or library
import sys
# Switch both standard streams to UTF-8 when the stream object offers
# reconfigure(); streams without that method are left as they are.
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, 'reconfigure'):
        _stream.reconfigure(encoding='utf-8')
# import: Imports a module or library
# as: Creates an alias for an import
import pandas as pd
# import: Imports a module or library
# as: Creates an alias for an import
import matplotlib.pyplot as plt
# import: Imports a module or library
# as: Creates an alias for an import
import seaborn as sns
# import: Imports a module or library
# as: Creates an alias for an import
import numpy as np
# import: Imports a module or library
# from: Imports specific items from a module
from scipy import stats
# Load the student records from the CSV in the working directory.
df = pd.read_csv('dataset2_students.csv')

# Row-wise mean of the four subject score columns.
_score_cols = ['Math_Score', 'Science_Score', 'English_Score', 'Social_Studies_Score']
df['Average_Score'] = df[_score_cols].mean(axis=1)

# Console overview: title banner, shape, first rows, describe() table.
_rule = "=" * 60
print(_rule)
print("STUDENT PERFORMANCE ANALYSIS")
print(_rule)
print("\nDataset Shape:", df.shape)
for _heading, _table in (
    ("\nFirst few records:", df.head()),
    ("\nStatistical Summary:", df.describe()),
):
    print(_heading)
    print(_table)
# A single 18x12 figure; every panel below is placed with
# plt.subplot(3, 4, n), i.e. a 3-row by 4-column grid.
fig = plt.figure(figsize=(18, 12))

# 1. Correlation Heatmap
plt.subplot(3, 4, 1)
numeric_cols = [
    'Study_Hours_Weekly',
    'Attendance_Percent',
    'Sleep_Hours',
    'Math_Score',
    'Science_Score',
    'English_Score',
    'Social_Studies_Score',
]
# Pairwise correlation of the columns listed above.
correlation_matrix = df[numeric_cols].corr()
heatmap_options = dict(
    annot=True,       # write the coefficient inside each cell
    fmt='.2f',
    cmap='coolwarm',
    center=0,         # colour map centred on zero correlation
    square=True,
    linewidths=1,
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix', fontsize=12, fontweight='bold')
# 2. Study Hours vs Average Score
plt.subplot(3, 4, 2)
study_hours = df['Study_Hours_Weekly']
average_score = df['Average_Score']

# Each point is one row of df; point colour encodes Sleep_Hours.
plt.scatter(
    study_hours,
    average_score,
    c=df['Sleep_Hours'],
    cmap='viridis',
    alpha=0.6,
    s=100,
)
plt.colorbar(label='Sleep Hours')

# Degree-1 polynomial fit, evaluated over the sorted x values so the
# dashed red trend line is drawn left to right.
trend_coeffs = np.polyfit(study_hours, average_score, 1)
trend_line = np.poly1d(trend_coeffs)
hours_sorted = study_hours.sort_values()
plt.plot(hours_sorted, trend_line(hours_sorted), "r--", linewidth=2, label='Trend')

plt.xlabel('Study Hours per Week')
plt.ylabel('Average Score')
plt.title('Study Hours vs Performance', fontsize=12, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
# 3. Attendance Impact
plt.subplot(3, 4, 3)

# Bucket attendance into four labelled bands. Bins are right-closed;
# include_lowest=True keeps a value of exactly 0 in the first band
# instead of turning it into NaN.
attendance_bins = pd.cut(
    df['Attendance_Percent'],
    bins=[0, 75, 85, 95, 100],
    labels=['Poor (<75%)', 'Fair (75-85%)', 'Good (85-95%)', 'Excellent (95-100%)'],
    include_lowest=True,
)
df['Attendance_Category'] = attendance_bins

# Mean average score per band. observed=False is passed explicitly so a
# band with no students still occupies its slot in the result; the colour
# list below is matched to bands purely by position, so a dropped band
# would shift every following colour.
attendance_performance = (
    df.groupby('Attendance_Category', observed=False)['Average_Score'].mean()
)

colors_att = ['#FF6B6B', '#FFA07A', '#98D8C8', '#6BCB77']
bar_positions = range(len(attendance_performance))
plt.bar(bar_positions, attendance_performance.values, color=colors_att)
plt.xticks(bar_positions, attendance_performance.index, rotation=45, ha='right')
plt.ylabel('Average Score')
plt.title('Attendance Impact on Performance', fontsize=12, fontweight='bold')
plt.grid(axis='y', alpha=0.3)
# 4. Score Distribution by Subject
plt.subplot(3, 4, 4)
subjects = ['Math_Score', 'Science_Score', 'English_Score', 'Social_Studies_Score']

# One array of raw scores per subject column, in the order of `subjects`.
violin_data = [df[column].values for column in subjects]
violin_positions = range(len(subjects))
parts = plt.violinplot(
    violin_data,
    positions=violin_positions,
    showmeans=True,
    showmedians=True,
)
plt.xticks(
    violin_positions,
    ['Math', 'Science', 'English', 'Social Studies'],
    rotation=45,
    ha='right',
)
plt.ylabel('Score')
plt.title('Score Distribution by Subject', fontsize=12, fontweight='bold')
plt.grid(axis='y', alpha=0.3)
# 5. Gender Performance Comparison
plt.subplot(3, 4, 5)

# Mean score per subject for each Gender value (rows) x subject (columns).
gender_scores = df.groupby('Gender')[subjects].mean()
x = np.arange(len(subjects))
width = 0.35

# Two bar series side by side around each subject position:
# 'Female' shifted half a bar left, 'Male' half a bar right.
for _offset, _gender, _colour in (
    (-width / 2, 'Female', '#FF6B9D'),
    (width / 2, 'Male', '#4A90E2'),
):
    plt.bar(x + _offset, gender_scores.loc[_gender], width, label=_gender, color=_colour)

plt.xlabel('Subject')
plt.ylabel('Average Score')
plt.title('Gender Performance Comparison', fontsize=12, fontweight='bold')
plt.xticks(x, ['Math', 'Science', 'English', 'Social'], rotation=45, ha='right')
plt.legend()
plt.grid(axis='y', alpha=0.3)
# 6. Extracurricular Impact
plt.subplot(3, 4, 6)

# Mean average score for students without / with activities.
# The bar labels below are fixed text, so the result is reindexed to the
# explicit order ['No', 'Yes']: each label is then tied to its group
# rather than to whatever order (or number) of groups groupby returns.
# A group absent from the data becomes NaN and is drawn as an empty bar.
extra_performance = (
    df.groupby('Extracurricular')['Average_Score'].mean().reindex(['No', 'Yes'])
)
plt.bar(
    ['No Activities', 'With Activities'],
    extra_performance.values,
    color=['#E17055', '#74B9FF'],
)
plt.ylabel('Average Score')
plt.title('Extracurricular Activities Impact', fontsize=12, fontweight='bold')

# Print each mean just above its bar; skip groups that have no data.
for i, v in enumerate(extra_performance.values):
    if pd.notna(v):
        plt.text(i, v + 1, f'{v:.1f}', ha='center', fontweight='bold')
plt.grid(axis='y', alpha=0.3)
# 7. Parent Education Effect
plt.subplot(3, 4, 7)

# Mean average score per Parent_Education value, sorted ascending.
mean_by_parent_ed = df.groupby('Parent_Education')['Average_Score'].mean()
parent_ed_scores = mean_by_parent_ed.sort_values()
plt.barh(
    parent_ed_scores.index,
    parent_ed_scores.values,
    color=['#FDA7DF', '#B39DDB', '#81C784'],
)
plt.xlabel('Average Score')
plt.title('Parent Education Level Impact', fontsize=12, fontweight='bold')
plt.grid(axis='x', alpha=0.3)
# 8. Sleep Hours Distribution
plt.subplot(3, 4, 8)

# Mean average score for each distinct Sleep_Hours value.
sleep_scores = df.groupby('Sleep_Hours')['Average_Score'].mean()
sleep_line_style = dict(marker='o', linewidth=2, markersize=10, color='#6C5CE7')
plt.plot(sleep_scores.index, sleep_scores.values, **sleep_line_style)
plt.xlabel('Sleep Hours')
plt.ylabel('Average Score')
plt.title('Sleep vs Academic Performance', fontsize=12, fontweight='bold')
plt.grid(True, alpha=0.3)
# 9. Top Performers Analysis
plt.subplot(3, 4, 9)

# The ten rows with the largest Average_Score, one horizontal bar each,
# labelled on the y axis with the row's Student_ID.
top_10 = df.nlargest(10, 'Average_Score')
bar_rows = range(len(top_10))
plt.barh(bar_rows, top_10['Average_Score'].values, color='#00B894')
plt.yticks(bar_rows, top_10['Student_ID'].values)
plt.xlabel('Average Score')
plt.title('Top 10 Students', fontsize=12, fontweight='bold')
plt.grid(axis='x', alpha=0.3)
# 10. Study Hours Distribution
plt.subplot(3, 4, 10)
weekly_hours = df['Study_Hours_Weekly']
mean_hours = weekly_hours.mean()

plt.hist(weekly_hours, bins=15, color='#FDCB6E', edgecolor='black', alpha=0.7)
# Dashed red vertical line at the mean; the value is shown in the legend.
plt.axvline(
    mean_hours,
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Mean: {mean_hours:.1f}h',
)
plt.xlabel('Study Hours per Week')
plt.ylabel('Number of Students')
plt.title('Study Hours Distribution', fontsize=12, fontweight='bold')
plt.legend()
plt.grid(axis='y', alpha=0.3)
# 11. Performance Categories
plt.subplot(3, 4, 11)

# Band names and their colours, both listed from lowest to highest band.
perf_labels = ['Failing', 'Pass', 'Good', 'Very Good', 'Excellent']
colors_perf = ['#D63031', '#FDCB6E', '#00B894', '#0984E3', '#6C5CE7']

# Bins are right-closed; include_lowest=True keeps an average of exactly 0
# in 'Failing' instead of turning it into NaN.
performance_categories = pd.cut(
    df['Average_Score'],
    bins=[0, 60, 70, 80, 90, 100],
    labels=perf_labels,
    include_lowest=True,
)

# value_counts() orders by frequency by default, which would pair the
# positional colour list with the wrong bands. Reindex to band order so
# each band always gets its own colour.
perf_counts = performance_categories.value_counts().reindex(perf_labels)

# Leave out empty bands so the pie has no zero-width wedges with
# overlapping "0.0%" labels, and look colours up by band name so the
# remaining wedges keep the right colour.
shown_counts = perf_counts[perf_counts > 0]
color_by_band = dict(zip(perf_labels, colors_perf))
plt.pie(
    shown_counts.values,
    labels=shown_counts.index,
    autopct='%1.1f%%',
    colors=[color_by_band[band] for band in shown_counts.index],
    startangle=90,
)
plt.title('Student Performance Distribution', fontsize=12, fontweight='bold')
# 12. Age vs Performance
plt.subplot(3, 4, 12)

# Mean and standard deviation of Average_Score for each Age value;
# the standard deviation is drawn as the error bar around the mean.
age_performance = df.groupby('Age')['Average_Score'].agg(['mean', 'std'])
errorbar_style = dict(marker='o', capsize=5, linewidth=2, markersize=8, color='#E84393')
plt.errorbar(
    age_performance.index,
    age_performance['mean'],
    yerr=age_performance['std'],
    **errorbar_style,
)
plt.xlabel('Age')
plt.ylabel('Average Score')
plt.title('Age vs Performance (with std dev)', fontsize=12, fontweight='bold')
plt.grid(True, alpha=0.3)
# Tighten panel spacing, write the figure to a 300-dpi PNG in the working
# directory, report it on the console, then display the figure.
dashboard_path = 'student_analysis_dashboard.png'
plt.tight_layout()
plt.savefig(dashboard_path, dpi=300, bbox_inches='tight')
print("\n[OK] Dashboard saved as 'student_analysis_dashboard.png'")
plt.show()
# Statistical Analysis: section banner
_insights_rule = "=" * 60
print("\n" + _insights_rule)
print("STATISTICAL INSIGHTS")
print(_insights_rule)

# Correlation of each habit column with Average_Score, one line per column.
print("\n1. CORRELATION WITH AVERAGE SCORE:")
habit_columns = ['Study_Hours_Weekly', 'Attendance_Percent', 'Sleep_Hours']
correlations = df[habit_columns].corrwith(df['Average_Score'])
for column_name, coefficient in correlations.items():
    print(f" {column_name}: {coefficient:.3f}")
# Independent two-sample t-test on Average_Score:
# rows with Extracurricular == 'Yes' versus rows with Extracurricular == 'No'.
extra_yes = df.loc[df['Extracurricular'] == 'Yes', 'Average_Score']
extra_no = df.loc[df['Extracurricular'] == 'No', 'Average_Score']
t_stat, p_value = stats.ttest_ind(extra_yes, extra_no)

print(f"\n2. EXTRACURRICULAR ACTIVITIES IMPACT:")
print(f" Students with activities: {extra_yes.mean():.2f}")
print(f" Students without: {extra_no.mean():.2f}")
print(f" T-statistic: {t_stat:.3f}, P-value: {p_value:.4f}")
# The difference is only called out when p is below the 0.05 threshold.
if p_value < 0.05:
    print(" [SIGNIFICANT] Statistically significant difference!")
# Gender comparison: mean and standard deviation of Average_Score per Gender,
# printed as the resulting two-column table.
scores_by_gender = df.groupby('Gender')['Average_Score']
gender_comparison = scores_by_gender.agg(['mean', 'std'])
print(f"\n3. GENDER PERFORMANCE:")
print(gender_comparison)
# Profile of high performers: rows whose Average_Score is at least 85.
high_performers = df[df['Average_Score'] >= 85]
print(f"\n4. HIGH PERFORMERS (Score >= 85) CHARACTERISTICS:")
if high_performers.empty:
    # With no qualifying rows every mean is NaN and the percentage is a
    # 0/0 division; say so explicitly instead of printing "nan" lines.
    print(" No students with an average score of 85 or above.")
else:
    print(f" Average Study Hours: {high_performers['Study_Hours_Weekly'].mean():.1f}")
    print(f" Average Attendance: {high_performers['Attendance_Percent'].mean():.1f}%")
    print(f" Average Sleep: {high_performers['Sleep_Hours'].mean():.1f} hours")
    # The mean of a boolean mask is the fraction of True values.
    extracurricular_share = (high_performers['Extracurricular'] == 'Yes').mean() * 100
    print(f" With Extracurriculars: {extracurricular_share:.1f}%")

# Closing separator for the insights section.
print("\n" + "=" * 60)