OpenPyXL and pandas

Combine the strengths of OpenPyXL and pandas for efficient Excel data processing and analysis

Why Combine Them

OpenPyXL and pandas each have their strengths. Combining them leverages the best of both:

Feature OpenPyXL pandas
Format Control ✅ Precise style and format control ❌ Limited format control
Data Analysis ❌ Limited analysis features ✅ Powerful data analysis capabilities
Performance ⚠️ Slower for large datasets ✅ High-performance data processing
Charts ✅ Supports native Excel charts ⚠️ Requires other libraries
Formulas ✅ Supports Excel formulas ❌ No formula support

Reading from Excel to pandas

Direct Reading with pandas

import pandas as pd

# Read Excel file
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

# Read multiple sheets
dfs = pd.read_excel('data.xlsx', sheet_name=['Sheet1', 'Sheet2'])

# Read all sheets
all_sheets = pd.read_excel('data.xlsx', sheet_name=None)

# Specify reading range
df = pd.read_excel('data.xlsx', usecols='A:D', nrows=100)

Reading with OpenPyXL and Converting

from openpyxl import load_workbook
import pandas as pd

# Read with OpenPyXL
wb = load_workbook('data.xlsx')
ws = wb.active

# Convert to pandas DataFrame
data = ws.values
columns = next(data)  # First row as column names
df = pd.DataFrame(data, columns=columns)

# Or use iter_rows
data = []
for row in ws.iter_rows(min_row=2, values_only=True):
    data.append(row)
df = pd.DataFrame(data, columns=['Column1', 'Column2', 'Column3'])

Writing from pandas to Excel

Direct Writing with pandas

import pandas as pd

# Create DataFrame
df = pd.DataFrame({
    'Name': ['Zhang San', 'Li Si', 'Wang Wu'],
    'Age': [25, 30, 35],
    'City': ['Beijing', 'Shanghai', 'Guangzhou']
})

# Write to Excel
df.to_excel('output.xlsx', index=False)

# Write multiple sheets
with pd.ExcelWriter('output.xlsx') as writer:
    df1.to_excel(writer, sheet_name='Sheet1', index=False)
    df2.to_excel(writer, sheet_name='Sheet2', index=False)

Combining with OpenPyXL for Formatting

import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill

# Write data with pandas
df.to_excel('output.xlsx', index=False)

# Add formatting with OpenPyXL
wb = load_workbook('output.xlsx')
ws = wb.active

# Format header row
header_fill = PatternFill(start_color='4472C4', fill_type='solid')
header_font = Font(bold=True, color='FFFFFF')

for cell in ws[1]:
    cell.fill = header_fill
    cell.font = header_font

# Adjust column widths
for column in ws.columns:
    max_length = 0
    column_letter = column[0].column_letter
    for cell in column:
        if cell.value:
            max_length = max(max_length, len(str(cell.value)))
    ws.column_dimensions[column_letter].width = max_length + 2

wb.save('output.xlsx')

Practical Workflows

Data Analysis Report Generation

import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.chart import BarChart, Reference
from openpyxl.styles import Font, PatternFill

# 1. Data analysis with pandas
df = pd.read_excel('sales_data.xlsx')
summary = df.groupby('Region')['Sales'].sum().reset_index()
summary = summary.sort_values('Sales', ascending=False)

# 2. Create new workbook
wb = Workbook()
ws = wb.active
ws.title = "Sales Analysis"

# 3. Write data
for r in dataframe_to_rows(summary, index=False, header=True):
    ws.append(r)

# 4. Add formatting
header_fill = PatternFill(start_color='4472C4', fill_type='solid')
for cell in ws[1]:
    cell.fill = header_fill
    cell.font = Font(bold=True, color='FFFFFF')

# 5. Add chart
chart = BarChart()
chart.title = "Sales by Region"
data = Reference(ws, min_col=2, min_row=1, max_row=len(summary)+1)
cats = Reference(ws, min_col=1, min_row=2, max_row=len(summary)+1)
chart.add_data(data, titles_from_data=True)
chart.set_categories(cats)
ws.add_chart(chart, "D2")

wb.save('sales_report.xlsx')

Batch Data Processing

import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import PatternFill

# 1. Read and merge multiple Excel files
files = ['data1.xlsx', 'data2.xlsx', 'data3.xlsx']
dfs = [pd.read_excel(f) for f in files]
combined_df = pd.concat(dfs, ignore_index=True)

# 2. Data cleaning and transformation
combined_df = combined_df.drop_duplicates()
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
combined_df['Month'] = combined_df['Date'].dt.month

# 3. Data analysis
monthly_summary = combined_df.groupby('Month').agg({
    'Sales': 'sum',
    'Orders': 'count',
    'Customers': 'nunique'
}).reset_index()

# 4. Write to Excel and format
monthly_summary.to_excel('monthly_report.xlsx', index=False)

wb = load_workbook('monthly_report.xlsx')
ws = wb.active

# Highlight maximum sales
max_sales = monthly_summary['Sales'].max()
for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=2, max_col=2):
    if row[0].value == max_sales:
        row[0].fill = PatternFill(start_color='00FF00', fill_type='solid')

wb.save('monthly_report.xlsx')

Advanced Tips

Using ExcelWriter

Use pandas' ExcelWriter for better control over the writing process, supporting append mode and engine selection.

Preserve Original Formatting

Use mode='a' to append data while preserving existing sheets and formatting.

Large Data Processing

Use pandas' chunksize parameter to read large files in chunks, avoiding memory overflow.

Data Validation

Combine pandas for data validation and OpenPyXL to add data validation rules.

ExcelWriter Example

import pandas as pd

# Use openpyxl engine
with pd.ExcelWriter('output.xlsx', engine='openpyxl') as writer:
    df1.to_excel(writer, sheet_name='Raw Data', index=False)
    df2.to_excel(writer, sheet_name='Summary', index=False)
    
    # Get workbook and worksheet objects
    workbook = writer.book
    worksheet = writer.sheets['Raw Data']
    
    # Add formatting with OpenPyXL
    from openpyxl.styles import Font
    for cell in worksheet[1]:
        cell.font = Font(bold=True)

Append Data to Existing File

import pandas as pd
from openpyxl import load_workbook

# Append new sheet
with pd.ExcelWriter('existing.xlsx', engine='openpyxl', mode='a') as writer:
    df.to_excel(writer, sheet_name='New Data', index=False)

Performance Optimization Tips

Best Practices Summary