"""
Session 1 - First Analysis in VS Code
======================================
File    : Session1-First-Analysis-V02.py
Author  : (your name)
Version : V02
Date    : 2026-05-13

Objective
---------
Load a CSV file from the same folder as this script, compute basic
descriptive statistics (mean, standard deviation) and the Pearson
correlation between Height and Weight, then save the results to a
new CSV file in the same folder.

Expected Folder Layout
----------------------
Chapter-01-First-Analysis/
    Height-Weight.csv               <- Input data (must exist before running)
    Session1-First-Analysis-V02.py  <- This script
    Summary-Session-V02.csv         <- Output produced by this script

Required Input File  (Height-Weight.csv)
----------------------------------------
    Name,Height_cm,Weight_kg
    Ali,175,70
    Sara,160,55
    John,180,75
    Mei,165,60
    Luis,170,68

Column names must match exactly:
    - Height_cm  (numeric, centimetres)
    - Weight_kg  (numeric, kilograms)

Quick Start
-----------
1. Open VS Code and open the Chapter-01-First-Analysis folder.
2. Open Session1-First-Analysis-V02.py in the editor.
3. Run the script (Ctrl+F5, or right-click -> Run Python File in Terminal).
4. Read the console output for a preview and computed statistics.
5. Open Summary-Session-V02.csv in the same folder to inspect saved results.

Step-by-Step Flow
-----------------
Step 1  : Import sys and reconfigure stdout/stderr to UTF-8 so special characters print correctly.
Step 2  : Import the remaining libraries: os and pandas (pandas is auto-installed if missing).
Step 3  : Define the input CSV filename and resolve its full path.
Step 4  : Load the CSV into a pandas DataFrame.
Step 5  : Print the first five rows as a quick data preview.
Step 6  : Compute mean and standard deviation for Height_cm and Weight_kg.
Step 7  : Print the summary statistics to the console.
Step 8  : Compute the Pearson correlation coefficient between the two columns.
Step 9  : Print the correlation value to the console.
Step 10 : Assemble all results into a small summary DataFrame.
Step 11 : Save the summary DataFrame to Summary-Session-V02.csv.
Step 12 : Print a success message showing the output file path.

Output File  (Summary-Session-V02.csv)
---------------------------------------
    Metric,Value
    Mean Height (cm),<value>
    Std Dev Height (cm),<value>
    Mean Weight (kg),<value>
    Std Dev Weight (kg),<value>
    Pearson Correlation,<value>

Notes
-----
- Column names Height_cm and Weight_kg are case-sensitive.
  If your CSV uses different names, update every place they appear in the script.
- Pearson correlation ranges from -1 (perfect negative) to +1 (perfect positive).
  A value near 1.0 means taller people tend to weigh more in this dataset.
- The script auto-installs pandas if it is not already present.
- UTF-8 reconfiguration lines at the top prevent garbled output on Windows.
"""

# ===========================================================================
# Step 1 - Switch the console streams to UTF-8
#          Keeps non-ASCII characters readable on Windows terminals.
# ===========================================================================
import sys

# reconfigure() only exists on Python 3.7+ text streams; any stream that
# lacks the method is simply left as it is.
for console_stream in (sys.stdout, sys.stderr):
    if hasattr(console_stream, 'reconfigure'):
        console_stream.reconfigure(encoding='utf-8')

# ===========================================================================
# Step 2 - Import standard libraries
# ===========================================================================
import os                                   # Used to build cross-platform file paths

# ---------------------------------------------------------------------------
# Step 2b - Import pandas (auto-install if missing)
# pandas provides the DataFrame structure used throughout this script.
# ---------------------------------------------------------------------------
try:
    import pandas as pd
except ImportError:
    print('[INFO] pandas not found. Attempting automatic install...')
    import importlib
    import subprocess
    # Run pip through the interpreter that is executing this script so the
    # package lands in the same environment. check_call raises
    # CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas'])
    # The import system caches what it has already seen on disk; clear those
    # caches so the package installed a moment ago is found by the retry.
    importlib.invalidate_caches()
    import pandas as pd                     # Re-import after installation
    print('[INFO] pandas installed successfully.')

# ===========================================================================
# Step 3 - Define file paths
#          Both input and output files sit in the same folder as this script.
#          Using os.path.dirname(__file__) makes this portable across machines.
#          The output name carries the same version tag (V02) as this script,
#          matching the folder layout described in the header.
# ===========================================================================
SCRIPT_DIR  = os.path.dirname(os.path.abspath(__file__))   # Folder of this .py file
INPUT_FILE  = 'Height-Weight.csv'                           # Input CSV filename
OUTPUT_FILE = 'Summary-Session-V02.csv'                     # Output CSV filename

input_path  = os.path.join(SCRIPT_DIR, INPUT_FILE)         # Full path to input
output_path = os.path.join(SCRIPT_DIR, OUTPUT_FILE)        # Full path to output

print(f'[DEBUG] Script folder : {SCRIPT_DIR}')
print(f'[DEBUG] Input file    : {input_path}')
print(f'[DEBUG] Output file   : {output_path}')

# ===========================================================================
# Step 4 - Load the CSV file into a pandas DataFrame
#          encoding='utf-8' handles accented characters in names.
#          If the file is missing, pandas raises a clear FileNotFoundError.
#          The two required columns are checked right after loading so that a
#          misnamed header fails here with a readable message instead of a
#          bare KeyError in a later step.
# ===========================================================================
print('\n[INFO] Loading data from CSV...')
df = pd.read_csv(input_path, encoding='utf-8')             # df = main data table
print(f'[DEBUG] Rows loaded: {len(df)}  |  Columns: {list(df.columns)}')

# Column names are case-sensitive, so compare them exactly as written.
missing_columns = [
    name for name in ('Height_cm', 'Weight_kg') if name not in df.columns
]
if missing_columns:
    raise ValueError(
        f'Required column(s) {missing_columns} not found in {input_path}. '
        f'Columns present: {list(df.columns)}'
    )

# ===========================================================================
# Step 5 - Show the top of the table as a quick sanity check
# ===========================================================================
preview_rows = df.head()                                    # head() returns at most 5 rows
print('\n=== Data Preview (first 5 rows) ===')
print(preview_rows)

# ===========================================================================
# Step 6 - Descriptive statistics for Height and Weight
#          mean() -> arithmetic average of the column
#          std()  -> sample standard deviation (pandas uses ddof=1 by default)
# ===========================================================================
height_values = df['Height_cm']         # Height column, selected once
weight_values = df['Weight_kg']         # Weight column, selected once

# Centre and spread of each column.
mean_height, std_height = height_values.mean(), height_values.std()
mean_weight, std_weight = weight_values.mean(), weight_values.std()

# ===========================================================================
# Step 7 - Report the summary statistics on the console
#          One (label, value, unit) entry per line, shown to one decimal.
# ===========================================================================
print('\n=== Summary Statistics ===')
for stat_label, stat_value, stat_unit in (
    ('Average Height', mean_height, 'cm'),
    ('Std Dev Height', std_height, 'cm'),
    ('Average Weight', mean_weight, 'kg'),
    ('Std Dev Weight', std_weight, 'kg'),
):
    print(f'  {stat_label} : {stat_value:.1f} {stat_unit}')

# ===========================================================================
# Step 8 - Pearson correlation between Height_cm and Weight_kg
#          Describes how strongly the two columns move together linearly:
#           1.0 -> perfect positive correlation (taller = heavier)
#           0.0 -> no linear relationship
#          -1.0 -> perfect negative correlation (taller = lighter)
# ===========================================================================
height_series = df['Height_cm']
weight_series = df['Weight_kg']
corr = height_series.corr(weight_series, method='pearson')  # 'pearson' is the pandas default

# ===========================================================================
# Step 9 - Print the correlation value
# ===========================================================================
print(f'\n  Pearson Correlation (Height vs Weight) : {corr:.2f}')

# Quick interpretation hint printed to console.
# NaN has to be tested first: every ordered comparison with NaN is False, so
# without this guard an undefined correlation would fall through to the final
# else branch and be reported as negative.
if pd.isna(corr):
    print('  [WARN] Correlation is undefined (NaN). '
          'Check for too few rows or a column with no variation.')
elif corr >= 0.8:
    print('  [INFO] Strong positive correlation.')
elif corr >= 0.5:
    print('  [INFO] Moderate positive correlation.')
elif corr >= 0.0:
    print('  [INFO] Weak or no positive correlation.')
else:
    print('  [INFO] Negative correlation detected.')

# ===========================================================================
# Step 10 - Collect the results in a two-column summary DataFrame
#           Metric holds the label, Value holds the number (4 decimals).
# ===========================================================================
metric_labels = [
    'Mean Height (cm)',
    'Std Dev Height (cm)',
    'Mean Weight (kg)',
    'Std Dev Weight (kg)',
    'Pearson Correlation',
]
# Values are listed in the same order as the labels above.
metric_values = [
    round(stat, 4)
    for stat in (mean_height, std_height, mean_weight, std_weight, corr)
]
summary = pd.DataFrame({'Metric': metric_labels, 'Value': metric_values})

print('\n=== Results Table ===')
print(summary.to_string(index=False))                      # Row numbers are left out

# ===========================================================================
# Step 11 - Write the summary table to the output CSV
#           index=False keeps the automatic row numbers out of the file.
# ===========================================================================
summary.to_csv(
    output_path,
    encoding='utf-8',
    index=False,
)

# ===========================================================================
# Step 12 - Print success message with the output file location,
#           followed by a short note on the folder layout of later sessions.
# ===========================================================================
print(f'\n[DONE] Results saved to: {output_path}')
# Only the line above interpolates a value; the rest are plain strings.
print('\nNote: the input and output files are both in the same folder as this script.')
print('In future sessions we will use:')
print('A-Data    - Where the input data files are stored')
print('B-Engines - Where the Python code is stored')
print('C-Results - Where the results of the program are stored')