Combine the strengths of OpenPyXL and pandas for efficient Excel data processing and analysis
OpenPyXL and pandas each have their strengths. Combining them leverages the best of both:
| Feature | OpenPyXL | pandas |
|---|---|---|
| Format Control | ✅ Precise style and format control | ❌ Limited format control |
| Data Analysis | ❌ Limited analysis features | ✅ Powerful data analysis capabilities |
| Performance | ⚠️ Slower for large datasets | ✅ High-performance data processing |
| Charts | ✅ Supports native Excel charts | ⚠️ Requires other libraries |
| Formulas | ✅ Supports Excel formulas | ❌ No formula support |
import pandas as pd
# Read Excel file
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')
# Read multiple sheets
dfs = pd.read_excel('data.xlsx', sheet_name=['Sheet1', 'Sheet2'])
# Read all sheets
all_sheets = pd.read_excel('data.xlsx', sheet_name=None)
# Specify reading range
df = pd.read_excel('data.xlsx', usecols='A:D', nrows=100)
from openpyxl import load_workbook
import pandas as pd
# Read with OpenPyXL
wb = load_workbook('data.xlsx')
ws = wb.active
# Convert to pandas DataFrame
data = ws.values
columns = next(data) # First row as column names
df = pd.DataFrame(data, columns=columns)
# Or use iter_rows
data = []
for row in ws.iter_rows(min_row=2, values_only=True):
data.append(row)
df = pd.DataFrame(data, columns=['Column1', 'Column2', 'Column3'])
import pandas as pd
# Create DataFrame
df = pd.DataFrame({
'Name': ['Zhang San', 'Li Si', 'Wang Wu'],
'Age': [25, 30, 35],
'City': ['Beijing', 'Shanghai', 'Guangzhou']
})
# Write to Excel
df.to_excel('output.xlsx', index=False)
# Write multiple sheets
with pd.ExcelWriter('output.xlsx') as writer:
df1.to_excel(writer, sheet_name='Sheet1', index=False)
df2.to_excel(writer, sheet_name='Sheet2', index=False)
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill
# Write data with pandas
df.to_excel('output.xlsx', index=False)
# Add formatting with OpenPyXL
wb = load_workbook('output.xlsx')
ws = wb.active
# Format header row
header_fill = PatternFill(start_color='4472C4', fill_type='solid')
header_font = Font(bold=True, color='FFFFFF')
for cell in ws[1]:
cell.fill = header_fill
cell.font = header_font
# Adjust column widths
for column in ws.columns:
max_length = 0
column_letter = column[0].column_letter
for cell in column:
if cell.value:
max_length = max(max_length, len(str(cell.value)))
ws.column_dimensions[column_letter].width = max_length + 2
wb.save('output.xlsx')
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.chart import BarChart, Reference
from openpyxl.styles import Font, PatternFill
# 1. Data analysis with pandas
df = pd.read_excel('sales_data.xlsx')
summary = df.groupby('Region')['Sales'].sum().reset_index()
summary = summary.sort_values('Sales', ascending=False)
# 2. Create new workbook
wb = Workbook()
ws = wb.active
ws.title = "Sales Analysis"
# 3. Write data
for r in dataframe_to_rows(summary, index=False, header=True):
ws.append(r)
# 4. Add formatting
header_fill = PatternFill(start_color='4472C4', fill_type='solid')
for cell in ws[1]:
cell.fill = header_fill
cell.font = Font(bold=True, color='FFFFFF')
# 5. Add chart
chart = BarChart()
chart.title = "Sales by Region"
data = Reference(ws, min_col=2, min_row=1, max_row=len(summary)+1)
cats = Reference(ws, min_col=1, min_row=2, max_row=len(summary)+1)
chart.add_data(data, titles_from_data=True)
chart.set_categories(cats)
ws.add_chart(chart, "D2")
wb.save('sales_report.xlsx')
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
# 1. Read and merge multiple Excel files
files = ['data1.xlsx', 'data2.xlsx', 'data3.xlsx']
dfs = [pd.read_excel(f) for f in files]
combined_df = pd.concat(dfs, ignore_index=True)
# 2. Data cleaning and transformation
combined_df = combined_df.drop_duplicates()
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
combined_df['Month'] = combined_df['Date'].dt.month
# 3. Data analysis
monthly_summary = combined_df.groupby('Month').agg({
'Sales': 'sum',
'Orders': 'count',
'Customers': 'nunique'
}).reset_index()
# 4. Write to Excel and format
monthly_summary.to_excel('monthly_report.xlsx', index=False)
wb = load_workbook('monthly_report.xlsx')
ws = wb.active
# Highlight maximum sales
max_sales = monthly_summary['Sales'].max()
for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=2, max_col=2):
if row[0].value == max_sales:
row[0].fill = PatternFill(start_color='00FF00', fill_type='solid')
wb.save('monthly_report.xlsx')
Use pandas' ExcelWriter for better control over the writing process, supporting append mode and engine selection.
Use mode='a' to append data while preserving existing sheets and formatting.
Use pandas' chunksize parameter to read large files in chunks, avoiding memory overflow.
Combine pandas for data validation and OpenPyXL to add data validation rules.
import pandas as pd
# Use openpyxl engine
with pd.ExcelWriter('output.xlsx', engine='openpyxl') as writer:
df1.to_excel(writer, sheet_name='Raw Data', index=False)
df2.to_excel(writer, sheet_name='Summary', index=False)
# Get workbook and worksheet objects
workbook = writer.book
worksheet = writer.sheets['Raw Data']
# Add formatting with OpenPyXL
from openpyxl.styles import Font
for cell in worksheet[1]:
cell.font = Font(bold=True)
import pandas as pd
from openpyxl import load_workbook
# Append new sheet
with pd.ExcelWriter('existing.xlsx', engine='openpyxl', mode='a') as writer:
df.to_excel(writer, sheet_name='New Data', index=False)