🐼 Pandas Cookbook
A practical guide to data manipulation with pandas

Pandas Cookbook: Common Operations for Data Science
This guide serves as a quick reference for the most common pandas operations you’ll use in DS105W. Each section includes practical examples that you can adapt for your own data analysis tasks.
1. Data Import & Export
Reading Data
# From CSV
df = pd.read_csv('file.csv')

# From Excel
df = pd.read_excel('file.xlsx', sheet_name='Sheet1')

# From JSON
df = pd.read_json('file.json')

# For nested JSON
df = pd.json_normalize(json_data)
Saving Data
# To CSV
df.to_csv('output.csv', index=False)

# To Excel
df.to_excel('output.xlsx', index=False)

# To JSON
df.to_json('output.json')
2. Data Inspection
# Basic information
df.info()      # Data types and non-null counts
df.describe()  # Statistical summary
df.head()      # First 5 rows
df.tail()      # Last 5 rows

# Column names and data types
df.columns  # List column names
df.dtypes   # Data types of each column

# Missing values
df.isna().sum()  # Count missing values per column
3. Data Selection
Selecting Columns
# Single column
series = df['column_name']

# Multiple columns
subset = df[['col1', 'col2', 'col3']]
Filtering Rows
# Use a boolean mask
mask = df['column'] > 5
filtered = df[mask]

# Do it all in one line
filtered = df[df['column'] > 5]

# Multiple conditions
filtered = df[
    (df['col1'] > 5) &
    (df['col2'] == 'value')
]

# Using isin
filtered = df[df['category'].isin(['A', 'B', 'C'])]
4. Data Transformation
Creating New Columns
# Based on existing columns
df['new_col'] = df['old_col'] * 2

# Apply an existing function to each element in a column
df['new_col'] = df['text_col'].apply(len)

# Apply a lambda function to each element in a column
df['new_col'] = df['col'].apply(lambda x: x.upper())
Renaming Columns
# Rename specific columns
df = df.rename(columns={
    'old_name1': 'new_name1',
    'old_name2': 'new_name2'
})

# Clean up all column names
df.columns = df.columns.str.lower()            # Make lowercase
df.columns = df.columns.str.replace(' ', '_')  # Replace spaces
5. Data Aggregation
Grouping and Summarizing (with agg())
# Use groupby() to split the data into groups
# and then apply an existing pandas operation to each group
# The output is a single number for each group
summary = df.groupby('category')['value'].mean()

# You can apply multiple operations to each group
# Use the `.agg()` method for that
summary = df.groupby('category').agg({
    'col1': 'mean',
    'col2': 'sum',
    'col3': 'count'
})

# Custom aggregation functions
def custom_metric(x):
    return x.max() - x.min()

# Apply a custom aggregation function to each group
summary = df.groupby('category')['value'].agg(custom_metric)
Grouping and Summarizing (with groupby.apply)
# Define a function to apply to each group
def process_group(group):
    return group['col1'].mean()

# Apply the function to each group
summary = df.groupby('category').apply(process_group)

# Use a more complex function, returning a Series
def process_group(group):
    output = pd.Series([
        group['col1'].mean(),
        group['col2'].sum()
    ], index=['mean', 'sum'])
    return output

# Apply the function to each group
summary = df.groupby('category').apply(process_group)
6. Data Reshaping
Melting (Wide to Long)
# Convert wide format to long format
df_long = df.melt(
    id_vars=['id_column'],        # Columns to keep as is
    value_vars=['col1', 'col2'],  # Columns to unpivot
    var_name='variable',          # Name for the variable column
    value_name='value'            # Name for the value column
)
Exploding Lists
# Expand list elements into separate rows
df_exploded = df.explode('list_column')
7. Data Cleaning
Handling Missing Values
# Drop rows with any missing values
df_clean = df.dropna()

# Fill missing values
df['column'].fillna(0)                    # With a specific value
df['column'].fillna(df['column'].mean())  # With mean
df['column'].ffill()                      # Forward fill
Removing Duplicates
# Remove duplicate rows
df_unique = df.drop_duplicates()

# Remove duplicates based on specific columns
df_unique = df.drop_duplicates(subset=['col1', 'col2'])
8. Working with Dates
# Convert to datetime
df['date'] = pd.to_datetime(df['date_string'])

# Extract components
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# Formatting
# Learn about formatting codes here:
# https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
9. Combining DataFrames
Merging
# Inner join
merged = pd.merge(df1, df2, on='key_column')

# Left join
merged = pd.merge(
    df1, df2,
    on='key_column',
    how='left'
)

# Using different key columns
merged = pd.merge(
    df1, df2,
    left_on='key1',
    right_on='key2'
)
Concatenating
# Vertically (stack)
combined = pd.concat([df1, df2])

# Horizontally (side by side)
combined = pd.concat([df1, df2], axis=1)