Data Analysis with Pandas

Pandas is Python's premier data analysis library. Learn how to work with DataFrames and Series, clean and manipulate data, perform statistical analysis, and create insightful visualizations.

What is Pandas?

Pandas is an open-source Python library providing high-performance, easy-to-use data structures and data analysis tools. It offers data structures like Series (1-dimensional) and DataFrame (2-dimensional) that make working with structured data intuitive and efficient.

"Pandas enables you to carry out your entire data analysis workflow in Python without having to switch to a more domain-specific language like R."

Installing Pandas and Basic Setup

Getting started with Pandas:

# Install pandas
# pip install pandas

import pandas as pd
import numpy as np

# Check version
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

# Set display options
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 5)
pd.set_option('display.width', 100)

Series - One Dimensional Data

Creating and working with Series:

import pandas as pd
import numpy as np

# Create Series from list
s1 = pd.Series([1, 3, 5, 6, 8])
print("Series from list:")
print(s1)
print(f"Type: {type(s1)}")
print(f"Shape: {s1.shape}")
print(f"Values: {s1.values}")
print(f"Index: {s1.index}")

# Series with custom index
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print("\nSeries with custom index:")
print(s2)

# Series from dictionary
data = {'Alice': 85, 'Bob': 92, 'Charlie': 78, 'Diana': 96}
s3 = pd.Series(data)
print("\nSeries from dictionary:")
print(s3)

# Accessing elements
print(f"\nFirst element (by position): {s3.iloc[0]}")  # s3[0] relies on positional fallback, which is deprecated for labelled Series
print(f"Element by label: {s3['Bob']}")
print(f"Multiple elements: {s3[['Alice', 'Charlie']]}")

# Series operations
print(f"\nMean: {s3.mean()}")
print(f"Max: {s3.max()}")
print(f"Min: {s3.min()}")
print(f"Standard deviation: {s3.std()}")

# Boolean indexing
print(f"\nScores above 90: {s3[s3 > 90]}")
print(f"Names with scores > 85: {s3[s3 > 85].index.tolist()}")

DataFrame - Two Dimensional Data

Creating and manipulating DataFrames:

import pandas as pd
import numpy as np

# DataFrame from dictionary
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'Age': [25, 30, 35, 28, 32],
    'City': ['New York', 'London', 'Paris', 'Tokyo', 'Sydney'],
    'Salary': [50000, 60000, 70000, 55000, 65000]
}

df = pd.DataFrame(data)
print("DataFrame from dictionary:")
print(df)
print(f"\nShape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Index: {df.index.tolist()}")

# DataFrame from list of lists
data2 = [
    ['Alice', 25, 'New York', 50000],
    ['Bob', 30, 'London', 60000],
    ['Charlie', 35, 'Paris', 70000]
]
df2 = pd.DataFrame(data2, columns=['Name', 'Age', 'City', 'Salary'])
print("\nDataFrame from list of lists:")
print(df2)

# Accessing columns
print(f"\nNames: {df['Name'].tolist()}")
print(f"Ages: {df.Age.tolist()}")

# Accessing rows
print(f"\nFirst row:\n{df.iloc[0]}")
print(f"Row with index 2:\n{df.loc[2]}")

# Multiple columns
print(f"\nName and Salary:\n{df[['Name', 'Salary']]}")

# Basic statistics
print(f"\nDataFrame statistics:")
print(df.describe())

# Info about DataFrame
print(f"\nDataFrame info:")
df.info()  # info() prints its report directly and returns None, so no print() wrapper is needed

Data Selection and Filtering

Advanced data selection techniques:

import pandas as pd
import numpy as np

# Create sample DataFrame
np.random.seed(42)
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve', 'Frank'],
    'Age': [25, 30, 35, 28, 32, 45],
    'Department': ['HR', 'IT', 'Finance', 'IT', 'HR', 'Finance'],
    'Salary': [50000, 60000, 70000, 55000, 65000, 80000],
    'Experience': [2, 5, 8, 3, 6, 10]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Boolean indexing
print(f"\nEmployees in IT department:")
print(df[df['Department'] == 'IT'])

print(f"\nEmployees with salary > 60000:")
print(df[df.Salary > 60000])

print(f"\nEmployees aged 25-35:")
print(df[(df.Age >= 25) & (df.Age <= 35)])

# Using query method
print(f"\nUsing query - HR department with experience > 3:")
print(df.query("Department == 'HR' and Experience > 3"))

# isin() method
print(f"\nEmployees in HR or Finance:")
print(df[df['Department'].isin(['HR', 'Finance'])])

# String methods
print(f"\nNames starting with 'A' or 'B':")
print(df[df['Name'].str.startswith(('A', 'B'))])

# nlargest and nsmallest
print(f"\nTop 3 highest salaries:")
print(df.nlargest(3, 'Salary'))

print(f"\nBottom 2 salaries:")
print(df.nsmallest(2, 'Salary'))

# Selecting specific rows and columns
print(f"\nRows 1-3, columns Name and Salary:")
print(df.loc[1:3, ['Name', 'Salary']])

print(f"\nFirst 3 rows, last 2 columns:")
print(df.iloc[:3, -2:])

Data Cleaning and Preprocessing

Handling missing data and data transformation:

import pandas as pd
import numpy as np

# Create DataFrame with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', None],
    'Age': [25, np.nan, 35, 28, 32],
    'Salary': [50000, 60000, np.nan, 55000, 65000],
    'Department': ['HR', 'IT', 'Finance', None, 'HR']
}

df = pd.DataFrame(data)
print("DataFrame with missing values:")
print(df)
print(f"\nMissing values per column:\n{df.isnull().sum()}")

# Drop rows with missing values
df_dropped = df.dropna()
print(f"\nAfter dropping rows containing NaN ({len(df) - len(df_dropped)} rows removed):")
print(df_dropped)

# Drop columns with missing values
df_dropped_cols = df.dropna(axis=1)
print(f"\nAfter dropping columns with NaN:")
print(df_dropped_cols)

# Fill missing values
df_filled = df.copy()
df_filled['Age'] = df_filled['Age'].fillna(df_filled['Age'].mean())
df_filled['Salary'] = df_filled['Salary'].fillna(df_filled['Salary'].median())
df_filled['Department'] = df_filled['Department'].fillna('Unknown')
df_filled['Name'] = df_filled['Name'].fillna('Unknown')

print(f"\nAfter filling missing values:")
print(df_filled)

# Forward fill and backward fill
df_ffill = df.ffill()  # fillna(method='ffill') is deprecated in newer pandas
print(f"\nForward fill:")
print(df_ffill)

df_bfill = df.bfill()
print(f"\nBackward fill:")
print(df_bfill)

# Data type conversion
df_clean = df_filled.copy()
df_clean['Age'] = df_clean['Age'].astype(int)
df_clean['Salary'] = df_clean['Salary'].astype(int)

print(f"\nData types after conversion:")
print(df_clean.dtypes)

# Remove duplicates
df_with_dups = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Alice', 'Charlie'],
    'Age': [25, 30, 25, 35]
})
print(f"\nDataFrame with duplicates:")
print(df_with_dups)

df_no_dups = df_with_dups.drop_duplicates()
print(f"\nAfter removing duplicates ({len(df_with_dups) - len(df_no_dups)} rows removed):")
print(df_no_dups)

# String operations
df_strings = pd.DataFrame({
    'Name': ['alice', 'BOB', 'Charlie', 'diana'],
    'Email': ['alice@email.com', 'bob@email.com', 'charlie@email.com', 'diana@email.com']
})

print(f"\nString operations:")
print(f"Original names: {df_strings['Name'].tolist()}")
print(f"Uppercase: {df_strings['Name'].str.upper().tolist()}")
print(f"Title case: {df_strings['Name'].str.title().tolist()}")
print(f"Contains 'a': {df_strings['Name'].str.contains('a').tolist()}")

# Replace values
df_replaced = df_clean.replace({'Department': {'Unknown': 'General'}})
print(f"\nAfter replacing 'Unknown' with 'General':")
print(df_replaced)

Data Aggregation and Grouping

Group operations and data aggregation:

import pandas as pd
import numpy as np

# Create sample sales data
np.random.seed(42)
data = {
    'Product': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B', 'A', 'C'],
    'Region': ['North', 'South', 'North', 'East', 'South', 'North', 'East', 'South', 'North', 'East'],
    'Sales': np.random.randint(100, 1000, 10),
    'Quantity': np.random.randint(1, 20, 10),
    'Month': ['Jan', 'Feb', 'Jan', 'Feb', 'Jan', 'Feb', 'Jan', 'Feb', 'Jan', 'Feb']
}

df = pd.DataFrame(data)
print("Sales DataFrame:")
print(df)

# Group by single column
print(f"\nSales by Product:")
product_sales = df.groupby('Product')['Sales'].sum()
print(product_sales)

# Group by multiple columns
print(f"\nSales by Product and Region:")
product_region_sales = df.groupby(['Product', 'Region'])['Sales'].sum()
print(product_region_sales)

# Multiple aggregations
print(f"\nMultiple aggregations by Product:")
agg_results = df.groupby('Product').agg({
    'Sales': ['sum', 'mean', 'count'],
    'Quantity': ['sum', 'mean']
})
print(agg_results)

# Custom aggregation functions
def range_func(x):
    return x.max() - x.min()

print(f"\nCustom aggregation (range) by Region:")
range_by_region = df.groupby('Region')['Sales'].agg(range_func)
print(range_by_region)

# Using transform for broadcasting results
print(f"\nSales deviation from product mean:")
df['Sales_Deviation'] = df.groupby('Product')['Sales'].transform(lambda x: x - x.mean())
print(df[['Product', 'Sales', 'Sales_Deviation']])

# Pivot tables
print(f"\nPivot table - Sales by Product and Month:")
pivot = df.pivot_table(values='Sales', index='Product', columns='Month', aggfunc='sum', fill_value=0)
print(pivot)

# Cross tabulation
print(f"\nCross tabulation - Product vs Region:")
crosstab = pd.crosstab(df['Product'], df['Region'])
print(crosstab)

# Value counts with normalization
print(f"\nProduct distribution (normalized):")
print(df['Product'].value_counts(normalize=True))

# Group by with filtering
print(f"\nProducts with total sales > 1500:")
high_sales_products = df.groupby('Product').filter(lambda x: x['Sales'].sum() > 1500)
print(high_sales_products['Product'].unique())

Merging and Joining DataFrames

Combining multiple DataFrames:

import pandas as pd

# Create sample DataFrames
employees = pd.DataFrame({
    'EmployeeID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'DepartmentID': [101, 102, 101, 103, 102]
})

departments = pd.DataFrame({
    'DepartmentID': [101, 102, 103, 104],
    'DepartmentName': ['HR', 'IT', 'Finance', 'Marketing'],
    'Location': ['Floor 1', 'Floor 2', 'Floor 3', 'Floor 1']
})

salaries = pd.DataFrame({
    'EmployeeID': [1, 2, 3, 4, 6],
    'Salary': [50000, 60000, 70000, 55000, 65000],
    'Bonus': [5000, 6000, 7000, 5500, 6500]
})

print("Employees DataFrame:")
print(employees)
print("\nDepartments DataFrame:")
print(departments)
print("\nSalaries DataFrame:")
print(salaries)

# Inner join (default)
print(f"\nInner join - Employees with departments:")
emp_dept = pd.merge(employees, departments, on='DepartmentID', how='inner')
print(emp_dept)

# Left join
print(f"\nLeft join - All employees, departments where available:")
emp_dept_left = pd.merge(employees, departments, on='DepartmentID', how='left')
print(emp_dept_left)

# Right join
print(f"\nRight join - All departments, employees where available:")
emp_dept_right = pd.merge(employees, departments, on='DepartmentID', how='right')
print(emp_dept_right)

# Outer join
print(f"\nOuter join - All employees and departments:")
emp_dept_outer = pd.merge(employees, departments, on='DepartmentID', how='outer')
print(emp_dept_outer)

# Multiple joins
print(f"\nComplete employee information:")
complete_info = pd.merge(emp_dept, salaries, on='EmployeeID', how='left')
print(complete_info)

# Join with different column names
employees2 = pd.DataFrame({
    'ID': [1, 2, 3],
    'Name': ['Frank', 'Grace', 'Henry']
})

salaries2 = pd.DataFrame({
    'EmpID': [1, 2, 4],
    'Salary': [75000, 80000, 72000]
})

print(f"\nJoin with different column names:")
joined_diff_cols = pd.merge(employees2, salaries2, left_on='ID', right_on='EmpID', how='outer')
print(joined_diff_cols)

# Concatenation
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})

print(f"\nConcatenation:")
concat_result = pd.concat([df1, df2])
print(concat_result)

# DataFrame.append was deprecated and removed in pandas 2.0; use pd.concat instead
print(f"\nConcatenation with a fresh index:")
concat_reindexed = pd.concat([df1, df2], ignore_index=True)
print(concat_reindexed)

Data Visualization with Pandas

Creating plots and charts:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Set style for better plots
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'default')

# Create sample data
np.random.seed(42)
dates = pd.date_range('2023-01-01', periods=100, freq='D')
data = {
    'Date': dates,
    'Sales': np.random.randint(1000, 5000, 100),
    'Temperature': np.random.normal(25, 5, 100),
    'Category': np.random.choice(['A', 'B', 'C'], 100)
}

df = pd.DataFrame(data)
df.set_index('Date', inplace=True)

print("Sample data for visualization:")
print(df.head())

# Line plot
print("\nCreating line plot of sales over time...")
df['Sales'].plot(figsize=(10, 6), title='Daily Sales Trend')
plt.ylabel('Sales ($)')
plt.xlabel('Date')
plt.tight_layout()
# plt.show()  # Uncomment to display plot

# Histogram
print("Creating histogram of temperatures...")
df['Temperature'].hist(bins=20, figsize=(8, 6), edgecolor='black')
plt.title('Temperature Distribution')
plt.xlabel('Temperature (°C)')
plt.ylabel('Frequency')
plt.tight_layout()
# plt.show()

# Bar plot
print("Creating bar plot of category counts...")
category_counts = df['Category'].value_counts()
category_counts.plot(kind='bar', figsize=(8, 6), color=['skyblue', 'lightgreen', 'salmon'])
plt.title('Category Distribution')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.tight_layout()
# plt.show()

# Scatter plot
print("Creating scatter plot of sales vs temperature...")
df.plot.scatter(x='Temperature', y='Sales', figsize=(8, 6), alpha=0.6)
plt.title('Sales vs Temperature')
plt.xlabel('Temperature (°C)')
plt.ylabel('Sales ($)')
plt.tight_layout()
# plt.show()

# Box plot
print("Creating box plot by category...")
df.boxplot(column='Sales', by='Category', figsize=(8, 6))
plt.title('Sales Distribution by Category')
plt.suptitle('')  # Remove automatic suptitle
plt.xlabel('Category')
plt.ylabel('Sales ($)')
plt.tight_layout()
# plt.show()

# Correlation heatmap (using seaborn would be better, but here's a simple version)
print("Creating correlation matrix...")
numeric_cols = df.select_dtypes(include=[np.number]).columns
correlation_matrix = df[numeric_cols].corr()
print("Correlation matrix:")
print(correlation_matrix)

# Simple correlation plot
plt.figure(figsize=(6, 4))
plt.imshow(correlation_matrix, cmap='coolwarm', aspect='auto')
plt.colorbar()
plt.xticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=45)
plt.yticks(range(len(correlation_matrix.columns)), correlation_matrix.columns)
plt.title('Correlation Heatmap')
plt.tight_layout()
# plt.show()

# Moving averages
print("Calculating and plotting moving averages...")
df['Sales_MA7'] = df['Sales'].rolling(window=7).mean()
df['Sales_MA30'] = df['Sales'].rolling(window=30).mean()

df[['Sales', 'Sales_MA7', 'Sales_MA30']].plot(figsize=(12, 6))
plt.title('Sales with Moving Averages')
plt.ylabel('Sales ($)')
plt.xlabel('Date')
plt.legend(['Daily Sales', '7-day MA', '30-day MA'])
plt.tight_layout()
# plt.show()

print("Visualization examples completed. Uncomment plt.show() to display plots.")

Time Series Analysis

Working with time series data:

import pandas as pd
import numpy as np

# Create time series data
dates = pd.date_range('2023-01-01', periods=365, freq='D')
np.random.seed(42)

# Simulate stock prices
initial_price = 100
price_changes = np.random.normal(0, 2, 365)
prices = initial_price + np.cumsum(price_changes)

# Create DataFrame
stock_data = pd.DataFrame({
    'Date': dates,
    'Price': prices,
    'Volume': np.random.randint(1000, 10000, 365),
    'Returns': np.concatenate([[0], np.diff(prices) / prices[:-1]])
})

stock_data.set_index('Date', inplace=True)

print("Stock data sample:")
print(stock_data.head())
print(f"\nData types:\n{stock_data.dtypes}")

# Resampling - convert daily to monthly ('ME' = month end; older pandas versions use 'M')
monthly_data = stock_data.resample('ME').agg({
    'Price': 'last',  # Last price of the month
    'Volume': 'sum',  # Total volume for the month
    'Returns': 'sum'  # Sum of daily returns
})

print(f"\nMonthly resampled data:")
print(monthly_data.head())

# Rolling statistics
stock_data['MA20'] = stock_data['Price'].rolling(window=20).mean()
stock_data['MA50'] = stock_data['Price'].rolling(window=50).mean()
stock_data['Volatility'] = stock_data['Returns'].rolling(window=20).std()

print(f"\nData with moving averages and volatility:")
print(stock_data[['Price', 'MA20', 'MA50', 'Volatility']].tail())

# Time-based filtering
print(f"\nData for January 2023:")
january_data = stock_data.loc['2023-01']
print(january_data.head())

print(f"\nData for Q1 2023:")
q1_data = stock_data.loc['2023-01':'2023-03']
print(f"Q1 records: {len(q1_data)}")

# Shifting for lag analysis
stock_data['Price_Lag1'] = stock_data['Price'].shift(1)
stock_data['Price_Lag2'] = stock_data['Price'].shift(2)

print(f"\nData with lagged prices:")
print(stock_data[['Price', 'Price_Lag1', 'Price_Lag2']].head())

# Percentage changes
stock_data['Pct_Change'] = stock_data['Price'].pct_change()
stock_data['Pct_Change_5d'] = stock_data['Price'].pct_change(periods=5)

print(f"\nPercentage changes:")
print(stock_data[['Price', 'Pct_Change', 'Pct_Change_5d']].head(10))

# Time zone handling
stock_data_utc = stock_data.copy()
stock_data_utc.index = stock_data_utc.index.tz_localize('UTC')
stock_data_est = stock_data_utc.tz_convert('US/Eastern')

print(f"\nTime zone conversion sample:")
print(f"UTC time: {stock_data_utc.index[0]}")
print(f"EST time: {stock_data_est.index[0]}")

# Business day operations
bday_data = stock_data[stock_data.index.weekday < 5]  # Monday to Friday
print(f"\nBusiness days: {len(bday_data)} out of {len(stock_data)} total days")

# Holiday handling
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay

bday_us = CustomBusinessDay(calendar=USFederalHolidayCalendar())
print(f"\nBusiness day offset: {bday_us}")

# Frequency conversion
weekly_data = stock_data.resample('W').last()
print(f"\nWeekly data (last price):")
print(weekly_data.head())

Best Practices

  • Use vectorized operations: Avoid loops when possible
  • Set appropriate data types: Use categories for strings, datetime for dates
  • Handle missing data explicitly: Don't ignore NaN values
  • Use method chaining: df.method1().method2().method3()
  • Index wisely: Choose meaningful indexes for better performance
  • Memory optimization: Use chunking for large files (a short sketch of this, the dtype, and the chaining tips follows this list)
  • Documentation: Comment complex operations
  • Version control: Track changes in data processing pipelines
  • Testing: Validate data transformations
  • Performance monitoring: Profile slow operations
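
The dtype, chaining, and chunking tips can be sketched in a few lines. The frame below is throwaway demonstration data, and the 'large_sales.csv' path in the final comment is a hypothetical placeholder rather than a file used elsewhere in this article:

import pandas as pd
import numpy as np

# Throwaway demonstration frame: a repetitive string column and a numeric column
df = pd.DataFrame({
    'Department': ['HR', 'IT', 'HR', 'Finance'] * 25000,
    'Salary': np.random.randint(40000, 90000, 100000)
})

# Categorical dtype for repetitive strings cuts memory substantially
before = int(df['Department'].memory_usage(deep=True))
df['Department'] = df['Department'].astype('category')
after = int(df['Department'].memory_usage(deep=True))
print(f"Department column: {before:,} bytes -> {after:,} bytes")

# Method chaining keeps a transformation readable without intermediate variables
summary = (
    df.query("Salary > 50000")
      .groupby('Department', observed=True)['Salary']
      .mean()
      .round(1)
)
print(summary)

# Chunked reading for files too large to load at once
# ('large_sales.csv' is a hypothetical placeholder path)
# total = sum(chunk['Salary'].sum() for chunk in pd.read_csv('large_sales.csv', chunksize=100_000))

Categorical columns pay off whenever a string column has far fewer unique values than rows, which is typical for department, region, or status fields.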

Common Pitfalls

  • SettingWithCopyWarning: Use .loc for safe assignment (see the sketch after this list)
  • Memory issues: Process large datasets in chunks
  • Datetime confusion: Handle timezones properly
  • Index alignment: Be aware of automatic alignment
  • GroupBy gotchas: Understand groupby object behavior
  • Merge issues: Check merge keys and types
  • Performance: Avoid unnecessary copies
  • Data leakage: Be careful with train/test splits
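
The SettingWithCopyWarning and merge-key pitfalls can be illustrated with small throwaway frames. This is a minimal sketch; the chained-assignment line is left commented out because it only warns and may not write through to the original frame:

import pandas as pd

df = pd.DataFrame({
    'Department': ['HR', 'IT', 'HR'],
    'Salary': [50000, 60000, 52000]
})

# Pitfall: chained indexing may assign into a temporary copy and trigger
# SettingWithCopyWarning, leaving the original frame unchanged
# df[df['Department'] == 'HR']['Salary'] = 55000

# Fix: select rows and column in a single .loc call
df.loc[df['Department'] == 'HR', 'Salary'] = 55000
print(df)

# Merge pitfall: keys stored with different dtypes (here int vs str) will not
# join cleanly, so inspect and align dtypes before merging
left = pd.DataFrame({'EmployeeID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Charlie']})
right = pd.DataFrame({'EmployeeID': ['1', '2', '3'], 'Bonus': [500, 600, 700]})
print(left['EmployeeID'].dtype, right['EmployeeID'].dtype)
right['EmployeeID'] = right['EmployeeID'].astype(int)  # align key dtypes first
print(pd.merge(left, right, on='EmployeeID', how='inner'))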

Pandas is an incredibly powerful library for data manipulation and analysis. It provides intuitive data structures and operations that make working with structured data efficient and enjoyable. Combined with NumPy and visualization libraries, it forms the backbone of Python's data science ecosystem.