# NOTE: This listing is the source code of Session-3-4-Examples.py. It was generated automatically from the original Python file for a browser-friendly HTML page.
"""
Advanced Python - Session 4: Data Analysis with Pandas & NumPy
Code Examples and Projects
Note: Install required packages:
pip install numpy pandas matplotlib seaborn
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
# Widen pandas' console output so the example DataFrames print without wrapping
for _option, _setting in (('display.max_columns', 20), ('display.width', 1000)):
    pd.set_option(_option, _setting)
# ============================================
# PART 1: NumPy Fundamentals
# ============================================
print("=" * 60, "PART 1: NumPy Fundamentals", "=" * 60, sep="\n")

# Example 1: the usual ways of constructing an array
print("\n--- Example 1: Creating NumPy Arrays ---")
arr1 = np.array([1, 2, 3, 4, 5])
arr2 = np.array([[1, 2, 3], [4, 5, 6]])
zeros = np.zeros(shape=5)
ones = np.ones(shape=(3, 3))
range_arr = np.arange(0, 10, 2)
linspace_arr = np.linspace(0, 1, num=5)
random_arr = np.random.rand(3, 3)  # built for illustration only; never printed
print("1D Array:", arr1)
print("2D Array:", arr2, sep="\n")
print("Zeros:", zeros)
print("Linspace:", linspace_arr)

# Example 2: metadata every ndarray carries
print("\n--- Example 2: Array Attributes ---")
matrix = np.array([[1, 2, 3], [4, 5, 6]])
for _label, _attribute in (
    ("Shape", "shape"),
    ("Dimensions", "ndim"),
    ("Size", "size"),
    ("Data type", "dtype"),
):
    print(f"{_label}: {getattr(matrix, _attribute)}")

# Example 3: element-wise arithmetic and comparison
print("\n--- Example 3: Array Operations ---")
a = np.array([1, 2, 3, 4, 5])
b = np.array([10, 20, 30, 40, 50])
print("a + b =", np.add(a, b))
print("a * 2 =", np.multiply(a, 2))
print("a ** 2 =", np.power(a, 2))
print("a > 3 =", np.greater(a, 3))

# Example 4: reductions over a whole array
print("\n--- Example 4: Array Functions ---")
data = np.array([12, 15, 18, 22, 25, 28, 30, 35])
print("Data:", data)
print("Sum:", data.sum())
print("Mean:", data.mean())
print("Median:", np.median(data))
print("Std Dev: {:.2f}".format(data.std()))
print("Min: {}, Max: {}".format(data.min(), data.max()))

# Example 5: positional, sliced and boolean-mask access
print("\n--- Example 5: Indexing and Slicing ---")
arr = np.array([10, 20, 30, 40, 50, 60])
_above_thirty = arr > 30
print("Array:", arr)
print("Element at index 2:", arr[2])
print("Slice [1:4]:", arr[1:4])
print("Every 2nd element:", arr[::2])
print("Reverse:", arr[::-1])
print("Elements > 30:", arr[_above_thirty])

# Example 6: addressing rows/columns and reducing along an axis
print("\n--- Example 6: 2D Array Operations ---")
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print("Matrix:", matrix, sep="\n")
print("Element [1,2]:", matrix[1, 2])
print("Row 1:", matrix[1, :])
print("Column 2:", matrix[:, 2])
print("Sum of each column:", matrix.sum(axis=0))
print("Sum of each row:", matrix.sum(axis=1))

# Example 7: a scalar is broadcast against every element
print("\n--- Example 7: Broadcasting ---")
a = np.array([1, 2, 3])
b = 10
print("Array:", a)
print("Array + 10 =", a + b)
print("Array * 2 =", a * 2)

# Example 8: summary statistics of 100 standard-normal draws
print("\n--- Example 8: Statistical Operations ---")
data = np.random.randn(100)
for _label, _statistic in (
    ("Mean", data.mean()),
    ("Std Dev", data.std()),
    ("Min", data.min()),
    ("Max", data.max()),
    ("25th percentile", np.percentile(data, 25)),
    ("75th percentile", np.percentile(data, 75)),
):
    print(f"{_label}: {_statistic:.4f}")
# ============================================
# PART 2: Pandas Series
# ============================================
print("\n" + "=" * 60, "PART 2: Pandas Series", "=" * 60, sep="\n")

# Example 1: default index, explicit index, and dict-derived index
print("\n--- Example 1: Creating Series ---")
s1 = pd.Series(data=[10, 20, 30, 40, 50])
s2 = pd.Series(data=[10, 20, 30], index=list('abc'))
s3 = pd.Series({'A': 100, 'B': 200, 'C': 300})
print("Series 1:", s1, sep="\n")
print("\nSeries with custom index:", s2, sep="\n")
print("\nSeries from dictionary:", s3, sep="\n")

# Example 2: statistics and boolean filtering on a labelled Series
print("\n--- Example 2: Series Operations ---")
scores = pd.Series(
    [85, 92, 78, 88, 95],
    index=['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
)
_high_scores = scores.loc[scores > 85]
print("Original scores:", scores, sep="\n")
print("\nMean: {:.2f}".format(scores.mean()))
print("Median:", scores.median())
print("Max:", scores.max())
print("Scores > 85:", _high_scores, sep="\n")
# ============================================
# PART 3: Pandas DataFrames
# ============================================
print("\n" + "=" * 60)
print("PART 3: Pandas DataFrames")
print("=" * 60)

# Example 1: Creating DataFrame from a dict of equal-length columns
print("\n--- Example 1: Creating DataFrame ---")
data = {
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'age': [25, 30, 35, 28, 32],
    'city': ['NYC', 'LA', 'Chicago', 'NYC', 'LA'],
    'salary': [70000, 80000, 75000, 72000, 85000],
}
df = pd.DataFrame(data)
print("DataFrame:")
print(df)
print(f"\nShape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Example 2: Viewing data
print("\n--- Example 2: Viewing Data ---")
print("First 3 rows:")
print(df.head(3))
print("\nDataFrame info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
df.info()
print("\nStatistical summary:")
print(df.describe())

# Example 3: Selecting data by column label, by position, and by condition
print("\n--- Example 3: Selecting Data ---")
print("Single column (Series):")
print(df['name'])
print("\nMultiple columns:")
print(df[['name', 'age']])
print("\nRow by position (iloc):")
print(df.iloc[0])
print("\nRows with age > 28:")
print(df[df['age'] > 28])

# Example 4: Adding columns derived from existing ones
print("\n--- Example 4: Adding/Modifying Columns ---")
df['bonus'] = df['salary'] * 0.1
df['total_comp'] = df['salary'] + df['bonus']
df['senior'] = df['age'] >= 30
print(df)
# ============================================
# PART 4: Data Cleaning
# ============================================
print("\n" + "=" * 60, "PART 4: Data Cleaning", "=" * 60, sep="\n")

# Example 1: counting and filling NaN cells
print("\n--- Example 1: Handling Missing Data ---")
df_missing = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12],
})
_column_means = df_missing.mean()
print("DataFrame with missing values:", df_missing, sep="\n")
print("\nMissing values per column:", df_missing.isnull().sum(), sep="\n")
print("\nFill with 0:", df_missing.fillna(value=0), sep="\n")
print("\nFill with column mean:", df_missing.fillna(value=_column_means), sep="\n")

# Example 2: dropping fully identical rows, keeping first or last occurrence
print("\n--- Example 2: Removing Duplicates ---")
df_dup = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Alice', 'Charlie', 'Bob'],
    'score': [85, 90, 85, 78, 92],
})
print("DataFrame with duplicates:", df_dup, sep="\n")
print("\nAfter removing duplicates:", df_dup.drop_duplicates(keep='first'), sep="\n")
print("\nKeep last duplicate:", df_dup.drop_duplicates(keep='last'), sep="\n")

# Example 3: casting string columns to numeric dtypes
print("\n--- Example 3: Data Type Conversion ---")
df_types = pd.DataFrame({
    'id': ['1', '2', '3'],
    'value': ['100', '200', '300'],
})
print("Original dtypes:", df_types.dtypes, sep="\n")
df_types = df_types.astype({'id': int, 'value': float})
print("\nConverted dtypes:", df_types.dtypes, sep="\n")
print(df_types)

# Example 4: vectorised string clean-up via the .str accessor
print("\n--- Example 4: String Operations ---")
df_strings = pd.DataFrame({
    'name': [' Alice ', 'BOB', 'charlie'],
    'email': ['alice@email.com', 'bob@email.com', 'charlie@email.com'],
})
print("Original:", df_strings, sep="\n")
_trimmed_names = df_strings['name'].str.strip()
df_strings['name'] = _trimmed_names.str.title()
df_strings['has_gmail'] = df_strings['email'].str.contains('gmail')
print("\nAfter string operations:", df_strings, sep="\n")
# ============================================
# PART 5: Data Aggregation & Grouping
# ============================================
print("\n" + "=" * 60, "PART 5: Data Aggregation & Grouping", "=" * 60, sep="\n")

# Small sales table shared by every example in this part
sales_data = pd.DataFrame({
    'product': ['A', 'B', 'A', 'B', 'A', 'C', 'C', 'B'],
    'region': ['East', 'East', 'West', 'West', 'East', 'West', 'East', 'East'],
    'sales': [100, 150, 120, 130, 110, 90, 95, 140],
    'quantity': [10, 15, 12, 13, 11, 9, 10, 14],
})
_sales_column = sales_data['sales']
_per_product = sales_data.groupby('product')

# Example 1: whole-column reductions
print("\n--- Example 1: Basic Aggregation ---")
print("Sales data:", sales_data, sep="\n")
print("\nTotal sales: ${}".format(_sales_column.sum()))
print("Average sales: ${:.2f}".format(_sales_column.mean()))
print("Max sales: ${}".format(_sales_column.max()))

# Example 2: reductions per group, including several at once
print("\n--- Example 2: GroupBy Operations ---")
print("Sales by product:", _per_product['sales'].sum(), sep="\n")
print("\nSales by region:", sales_data.groupby('region')['sales'].mean(), sep="\n")
_product_summary = _per_product.agg({
    'sales': ['sum', 'mean', 'count'],
    'quantity': 'sum',
})
print("\nMultiple aggregations:", _product_summary, sep="\n")

# Example 3: grouping on a pair of keys
print("\n--- Example 3: Multiple GroupBy ---")
_per_product_region = sales_data.groupby(['product', 'region'])
print("Sales by product and region:", _per_product_region['sales'].sum(), sep="\n")

# Example 4: ordering rows by a column
print("\n--- Example 4: Sorting ---")
print("Sort by sales (descending):", sales_data.sort_values(by='sales', ascending=False), sep="\n")
# ============================================
# PART 6: Merging & Joining
# ============================================
print("\n" + "=" * 60, "PART 6: Merging & Joining", "=" * 60, sep="\n")

# Example 1: stacking two frames with identical columns
print("\n--- Example 1: Concatenation ---")
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
_stacked = pd.concat([df1, df2], ignore_index=True)
print("DataFrame 1:", df1, sep="\n")
print("\nDataFrame 2:", df2, sep="\n")
print("\nVertical concatenation:", _stacked, sep="\n")

# Example 2: key-based joins; emp_id 4 and 5 each exist on one side only
print("\n--- Example 2: Merge (Join) ---")
employees = pd.DataFrame({
    'emp_id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
})
departments = pd.DataFrame({
    'emp_id': [1, 2, 3, 5],
    'department': ['HR', 'IT', 'Sales', 'Marketing'],
})
print("Employees:", employees, sep="\n")
print("\nDepartments:", departments, sep="\n")
print("\nInner join:", employees.merge(departments, on='emp_id', how='inner'), sep="\n")
print("\nLeft join:", employees.merge(departments, on='emp_id', how='left'), sep="\n")
# ============================================
# PART 7: Data Visualization
# ============================================
# Draws six sample charts on one 2x3 figure and saves it as a PNG.
print("\n" + "=" * 60)
print("PART 7: Data Visualization")
print("=" * 60)
print("\n--- Creating sample visualizations ---")

# Set style
sns.set_style('whitegrid')
plt.figure(figsize=(15, 10))

# Create sample data for visualization
viz_data = pd.DataFrame({
    'month': ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'],
    'sales': [1000, 1200, 1100, 1300, 1400, 1350],
    'expenses': [800, 900, 850, 950, 1000, 980],
})

# Plot 1: Line plot
plt.subplot(2, 3, 1)
viz_data.plot(x='month', y=['sales', 'expenses'], kind='line', ax=plt.gca())
plt.title('Sales vs Expenses')
plt.ylabel('Amount ($)')

# Plot 2: Bar plot
plt.subplot(2, 3, 2)
viz_data.plot(x='month', y='sales', kind='bar', ax=plt.gca(), color='skyblue')
plt.title('Monthly Sales')
plt.ylabel('Sales ($)')
plt.xticks(rotation=0)  # pandas rotates bar labels by default; keep them horizontal

# Plot 3: Histogram
plt.subplot(2, 3, 3)
random_data = np.random.normal(100, 15, 1000)
plt.hist(random_data, bins=30, edgecolor='black', alpha=0.7)
plt.title('Distribution of Values')
plt.xlabel('Value')
plt.ylabel('Frequency')

# Plot 4: Scatter plot (y follows x with added noise)
plt.subplot(2, 3, 4)
x = np.random.rand(50) * 100
y = x + np.random.randn(50) * 10
plt.scatter(x, y, alpha=0.6)
plt.title('Scatter Plot')
plt.xlabel('X values')
plt.ylabel('Y values')

# Plot 5: Box plot
plt.subplot(2, 3, 5)
box_data = [
    np.random.normal(100, 10, 100),
    np.random.normal(120, 15, 100),
    np.random.normal(90, 12, 100),
]
plt.boxplot(box_data)
# boxplot's `labels=` keyword is deprecated (renamed `tick_labels` in
# Matplotlib 3.9). Boxes are drawn at positions 1..n by default, so labelling
# the ticks explicitly works on both old and new Matplotlib versions.
plt.xticks(range(1, len(box_data) + 1), ['Group A', 'Group B', 'Group C'])
plt.title('Box Plot Comparison')
plt.ylabel('Values')

# Plot 6: Pie chart
plt.subplot(2, 3, 6)
sizes = [30, 25, 20, 25]
labels = ['Product A', 'Product B', 'Product C', 'Product D']
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
plt.title('Market Share')

plt.tight_layout()
plt.savefig('data_visualizations.png', dpi=100, bbox_inches='tight')
print("Saved visualizations to 'data_visualizations.png'")
plt.close()  # release the figure now that it has been written to disk
# ============================================
# PROJECT 1: Sales Data Analysis
# ============================================
print("\n" + "=" * 60, "PROJECT 1: Sales Data Analysis", "=" * 60, sep="\n")

# Build a synthetic order table; the fixed seed makes every run identical
np.random.seed(42)
dates = pd.date_range(start='2024-01-01', periods=100, freq='D')
products = ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Headphones']
regions = ['North', 'South', 'East', 'West']
_order_count = 500
_price_points = [29.99, 79.99, 299.99, 349.99, 149.99]
# Columns are drawn in this exact order so the random stream is consumed
# the same way on every run.
sales_df = pd.DataFrame({
    'date': np.random.choice(dates, _order_count),
    'product': np.random.choice(products, _order_count),
    'region': np.random.choice(regions, _order_count),
    'quantity': np.random.randint(1, 20, _order_count),
    'unit_price': np.random.choice(_price_points, _order_count),
})

# Revenue per order line
sales_df['total_sales'] = sales_df['quantity'].mul(sales_df['unit_price'])

# Persist the dataset for later exercises
sales_df.to_csv('sales_analysis_data.csv', index=False)
print("Created sales_analysis_data.csv")

print("\n--- Sales Analysis ---")
print("Dataset shape: {}".format(sales_df.shape))
print("\nFirst few rows:")
print(sales_df.head())

print("\n--- Overall Statistics ---")
_revenue = sales_df['total_sales']
print("Total Revenue: ${:,.2f}".format(_revenue.sum()))
print("Average Order Value: ${:,.2f}".format(_revenue.mean()))
print("Total Orders: {}".format(len(sales_df)))
print("Total Units Sold: {}".format(sales_df['quantity'].sum()))

print("\n--- Sales by Product ---")
_product_totals = sales_df.groupby('product').agg({
    'total_sales': 'sum',
    'quantity': 'sum',
})
product_sales = _product_totals.sort_values('total_sales', ascending=False)
print(product_sales)

print("\n--- Sales by Region ---")
region_sales = sales_df.groupby('region').agg({
    'total_sales': ['sum', 'mean', 'count'],
})
print(region_sales)

print("\n--- Top 5 Sales Days ---")
_revenue_per_day = sales_df.groupby('date')['total_sales'].sum()
daily_sales = _revenue_per_day.sort_values(ascending=False)
print(daily_sales.head())
# ============================================
# PROJECT 2: Student Performance Analysis
# ============================================
print("\n" + "=" * 60, "PROJECT 2: Student Performance Analysis", "=" * 60, sep="\n")

# Synthetic roster of 50 students; seeded so the scores are reproducible
np.random.seed(42)
_student_count = 50
_student_ids = range(1, _student_count + 1)
students_df = pd.DataFrame({
    'student_id': _student_ids,
    'name': [f'Student_{i}' for i in _student_ids],
    'math_score': np.random.randint(60, 100, _student_count),
    'science_score': np.random.randint(55, 100, _student_count),
    'english_score': np.random.randint(65, 100, _student_count),
    'class': np.random.choice(['A', 'B', 'C'], _student_count),
    'gender': np.random.choice(['M', 'F'], _student_count),
})

# Per-student mean across the three subjects
_subject_columns = ['math_score', 'science_score', 'english_score']
students_df['average_score'] = students_df[_subject_columns].mean(axis=1)
# Assign grades
def assign_grade(score):
    """Return the letter grade for a numeric score.

    Thresholds are inclusive lower bounds: 90 and above is 'A', 80 and above
    is 'B', 70 and above is 'C', 60 and above is 'D'; anything lower is 'F'.

    Args:
        score: Numeric score to classify.

    Returns:
        One of 'A', 'B', 'C', 'D' or 'F'.
    """
    # Checked from the highest threshold down, so the first match wins.
    if score >= 90:
        return 'A'
    if score >= 80:
        return 'B'
    if score >= 70:
        return 'C'
    if score >= 60:
        return 'D'
    return 'F'
# Letter grade derived from each student's average score
students_df['grade'] = students_df['average_score'].apply(assign_grade)

# Save to CSV
students_df.to_csv('student_performance.csv', index=False)
print("Created student_performance.csv")

print("\n--- Student Performance Analysis ---")
print(f"Total students: {len(students_df)}")
print("\nFirst few students:")
print(students_df.head())

print("\n--- Overall Statistics ---")
print(students_df[['math_score', 'science_score', 'english_score', 'average_score']].describe())

print("\n--- Grade Distribution ---")
# sort_index() lists the grades alphabetically instead of by frequency
print(students_df['grade'].value_counts().sort_index())

print("\n--- Performance by Class ---")
class_performance = students_df.groupby('class').agg({
    'math_score': 'mean',
    'science_score': 'mean',
    'english_score': 'mean',
    'average_score': 'mean',
}).round(2)
print(class_performance)

print("\n--- Top 10 Students ---")
top_students = students_df.nlargest(10, 'average_score')[['name', 'average_score', 'grade', 'class']]
print(top_students)

print("\n--- Students Needing Support (Average < 70) ---")
struggling = students_df[students_df['average_score'] < 70][['name', 'average_score', 'class']]
print(f"Number of students: {len(struggling)}")
# Only print the table when at least one student is below the threshold;
# the body of this `if` must be indented for the file to parse.
if not struggling.empty:
    print(struggling)
# ============================================
# PROJECT 3: Time Series Analysis
# ============================================
print("\n" + "=" * 60)
print("PROJECT 3: Time Series Analysis")
print("=" * 60)

# Create time series data: 365 daily observations starting 2024-01-01
date_range = pd.date_range('2024-01-01', periods=365, freq='D')
np.random.seed(42)  # fixed seed so the noise component is reproducible

# Generate trend + seasonality + noise
trend = np.linspace(100, 150, 365)
seasonality = 10 * np.sin(np.linspace(0, 4*np.pi, 365))  # two full sine cycles
noise = np.random.normal(0, 5, 365)
values = trend + seasonality + noise

timeseries_df = pd.DataFrame({
    'date': date_range,
    'value': values,
})
# A DatetimeIndex is required for resample() below
timeseries_df = timeseries_df.set_index('date')

# Save to CSV
timeseries_df.to_csv('timeseries_data.csv')
print("Created timeseries_data.csv")

print("\n--- Time Series Analysis ---")
print(f"Data range: {timeseries_df.index.min()} to {timeseries_df.index.max()}")
print(f"Number of observations: {len(timeseries_df)}")

print("\n--- Monthly Statistics ---")
# The 'M' frequency alias is deprecated since pandas 2.2 (renamed 'ME').
# Passing the MonthEnd offset object yields the same month-end bins and is
# accepted by both older and newer pandas releases.
monthly = timeseries_df.resample(pd.offsets.MonthEnd()).agg({
    'value': ['mean', 'min', 'max', 'std'],
}).round(2)
print(monthly.head())

print("\n--- Overall Statistics ---")
print(f"Mean: {timeseries_df['value'].mean():.2f}")
print(f"Std Dev: {timeseries_df['value'].std():.2f}")
print(f"Min: {timeseries_df['value'].min():.2f}")
print(f"Max: {timeseries_df['value'].max():.2f}")

# Calculate moving averages; the first window-1 rows are NaN by design
timeseries_df['MA_7'] = timeseries_df['value'].rolling(window=7).mean()
timeseries_df['MA_30'] = timeseries_df['value'].rolling(window=30).mean()

print("\n--- Recent Data with Moving Averages ---")
print(timeseries_df.tail(10))
# Closing banner followed by the list of files this script produced
print(
    "\n" + "=" * 60,
    "Advanced Session 4 Completed!",
    "Master Pandas and NumPy for data analysis!",
    "=" * 60,
    "\n--- Generated Files ---",
    "1. sales_analysis_data.csv - Sales data for analysis",
    "2. student_performance.csv - Student grades dataset",
    "3. timeseries_data.csv - Time series data",
    "4. data_visualizations.png - Sample visualizations",
    "\nUse these files for practice and exercises!",
    sep="\n",
)