PDF / PPTX / DOCX / XLSX 文档转换工具集
四大办公文档格式的转换、提取、合并、拆分工具。
基于 Python 生态,所有操作均可通过 execute_command 执行 Python 一行命令或短脚本完成。
依赖安装:pip install PyMuPDF pdf2docx python-docx python-pptx openpyxl pandas Pillow pdfplumber(快速参考表中提到的 mammoth、reportlab 为可选库,用到时另行 pip install)
部分操作(DOCX→PDF、PPTX→PDF、XLSX→PDF)需要系统安装 LibreOffice:
# Windows (winget)
winget install LibreOffice.LibreOffice
# macOS
brew install --cask libreoffice
# Ubuntu/Debian
sudo apt install libreoffice
PDF 转图片如需高质量渲染,可选装 poppler(PyMuPDF 内置渲染已足够,poppler 仅作为备选)。
快速参考:支持的全部转换
源格式 目标格式 推荐库 备注
PDF 图片 (PNG/JPG) PyMuPDF 逐页渲染,支持 DPI 控制
PDF DOCX pdf2docx 保留布局、表格、图片
PDF PPTX PyMuPDF + python-pptx 每页一张幻灯片
PDF XLSX pdfplumber + openpyxl 提取表格数据
PDF 文本 (TXT) PyMuPDF 提取纯文本
DOCX PDF LibreOffice (CLI) 最佳保真度
DOCX PPTX python-docx + python-pptx 段落→幻灯片
DOCX HTML python-docx / mammoth 保留基本格式
DOCX 纯文本 python-docx 提取所有段落文本
PPTX PDF LibreOffice (CLI) 最佳保真度
PPTX 图片 (PNG) LibreOffice + PyMuPDF 先转为 PDF,再每页导出为图片
PPTX DOCX python-pptx + python-docx 提取所有文本
PPTX 纯文本 python-pptx 提取幻灯片文本
XLSX PDF LibreOffice (CLI) 最佳保真度
XLSX CSV pandas 可指定 sheet
XLSX DOCX openpyxl + python-docx 表格写入 Word
XLSX JSON pandas 结构化数据导出
图片 PDF Pillow + reportlab 多图合并为 PDF
转换命令详解
python -c "
# Render each page of a PDF to its own image file.
# argv: 1 = PDF path, 2 = output directory (default '.'),
#       3 = DPI (default 200), 4 = image format / file extension (default 'png').
import fitz, sys, os
pdf_path = sys.argv[1]
out_dir = sys.argv[2] if len(sys.argv) > 2 else '.'
os.makedirs(out_dir, exist_ok=True)
doc = fitz.open(pdf_path)
dpi = int(sys.argv[3]) if len(sys.argv) > 3 else 200
fmt = sys.argv[4] if len(sys.argv) > 4 else 'png'
# Read the page count while the document is still open: len(doc) raises
# ValueError once doc.close() has been called.
page_count = len(doc)
for i, page in enumerate(doc):
    pix = page.get_pixmap(dpi=dpi)
    # Zero-padded page numbers keep the files in page order when sorted by name.
    out = os.path.join(out_dir, f'page_{i+1:04d}.{fmt}')
    pix.save(out)
    print(f'Saved: {out}')
doc.close()
print(f'Done: {page_count} pages -> {out_dir}')
" "input.pdf" "./output_images" 200 png
参数说明:
arg1 — PDF 文件路径
arg2 — 输出目录(默认当前目录)
arg3 — DPI 分辨率(默认 200,推荐 150~300)
arg4 — 图片格式:png(默认)或 jpg
python -c "
# Convert a PDF to DOCX with pdf2docx.
# argv: 1 = PDF path, 2 = output .docx path (default 'output.docx').
import sys
from pdf2docx import Converter
src = sys.argv[1]
dst = sys.argv[2] if len(sys.argv) > 2 else 'output.docx'
converter = Converter(src)
converter.convert(dst)
converter.close()
print('Done')
" "input.pdf" "output.docx"
可选参数(修改脚本,作为关键字参数传给 cv.convert()):
start=0, end=None — 指定页码范围
multi_processing=True — 多进程加速大文件
python -c "
# Turn a PDF into a PPTX deck: one slide per page, each page rendered as a picture.
# argv: 1 = PDF path, 2 = output .pptx path (default 'output.pptx').
import fitz, sys
from pptx import Presentation
from pptx.util import Inches
import io
pdf_path = sys.argv[1]
out_path = sys.argv[2] if len(sys.argv) > 2 else 'output.pptx'
doc = fitz.open(pdf_path)
prs = Presentation()
prs.slide_width = Inches(13.333)
prs.slide_height = Inches(7.5)
blank_layout = prs.slide_layouts[6]  # blank
page_count = len(doc)
for i, page in enumerate(doc):
    pix = page.get_pixmap(dpi=200)
    img_data = pix.tobytes('png')
    # Scale the image to fit inside the slide while keeping its aspect ratio,
    # then centre it. Forcing both width and height to the slide size would
    # stretch any page that is not 16:9 (for example portrait A4).
    scale = min(prs.slide_width / pix.width, prs.slide_height / pix.height)
    pic_w = int(pix.width * scale)
    pic_h = int(pix.height * scale)
    left = (prs.slide_width - pic_w) // 2
    top = (prs.slide_height - pic_h) // 2
    slide = prs.slides.add_slide(blank_layout)
    slide.shapes.add_picture(io.BytesIO(img_data), left, top, width=pic_w, height=pic_h)
    print(f'Page {i+1}/{page_count} added')
prs.save(out_path)
doc.close()
print(f'Done: {out_path}')
" "input.pdf" "output.pptx"
python -c "
# Extract every table pdfplumber finds in a PDF into an Excel workbook.
# argv: 1 = PDF path, 2 = output .xlsx path (default 'output.xlsx').
import pdfplumber, openpyxl, sys
pdf_path = sys.argv[1]
out_path = sys.argv[2] if len(sys.argv) > 2 else 'output.xlsx'
wb = openpyxl.Workbook()
ws_total = wb.active
ws_total.title = 'All Tables'
row_offset = 0
with pdfplumber.open(pdf_path) as pdf:
    for page_num, page in enumerate(pdf.pages):
        tables = page.extract_tables()
        for t_idx, table in enumerate(tables):
            # Until any rows have been written, the first table on a page lands
            # on the default sheet; every other table gets its own sheet named
            # after its page and table index.
            if row_offset == 0 and t_idx == 0:
                ws = ws_total
            else:
                ws = wb.create_sheet(title=f'p{page_num+1}_t{t_idx+1}')
            for row in table:
                ws.append(row)
            row_offset += len(table)
            print(f'Page {page_num+1}, Table {t_idx+1}: {len(table)} rows')
# Drop the default sheet only when it holds no data AND another sheet exists.
# An empty sheet reports max_row == max_column == 1, exactly like a sheet with
# a single filled cell, so the A1 value is checked as well. Removing the only
# sheet would leave a workbook that cannot be saved.
default_is_empty = (
    ws_total.max_row == 1
    and ws_total.max_column == 1
    and ws_total.cell(row=1, column=1).value is None
)
if default_is_empty and len(wb.worksheets) > 1:
    wb.remove(ws_total)
wb.save(out_path)
print(f'Done: {out_path}')
" "input.pdf" "output.xlsx"
python -c "
# Dump the text of every PDF page into one UTF-8 text file.
# argv: 1 = PDF path, 2 = output text path (default 'output.txt').
import fitz, sys
doc = fitz.open(sys.argv[1])
out_path = sys.argv[2] if len(sys.argv) > 2 else 'output.txt'
with open(out_path, 'w', encoding='utf-8') as f:
    for i, page in enumerate(doc):
        text = page.get_text()
        # Each page is preceded by a separator line carrying its 1-based number.
        f.write(f'--- Page {i+1} ---\n{text}\n\n')
# The page count is read before close(); a closed document rejects len().
print(f'Done: {len(doc)} pages extracted')
doc.close()
" "input.pdf" "output.txt"
# Windows
python -c "
# Convert a DOCX to PDF with headless LibreOffice; the PDF is written to the
# directory that contains the source file.
# argv: 1 = DOCX path.
import os, subprocess, sys
soffice = r'C:\Program Files\LibreOffice\program\soffice.exe'
src = os.path.abspath(sys.argv[1])
target_dir = os.path.dirname(src)
cmd = [soffice, '--headless', '--convert-to', 'pdf', '--outdir', target_dir, src]
# check=True raises CalledProcessError when soffice exits with a non-zero status.
subprocess.run(cmd, check=True)
print('Done')
" "input.docx"
# macOS / Linux
# Headless LibreOffice conversion of input.docx to PDF, written to the current directory.
soffice --headless --convert-to pdf --outdir ./ "input.docx"
python -c "
from docx import Document
from pptx import Presentation
from pptx.util import Pt, Inches
import sys
docx_path = sys.argv[1]
out_path = sys.argv[2] if len(sys.argv)>2 else 'output.pptx'
doc = Document(docx_path)
prs = Presentation()
prs.slide_width = Inches(13.333)
prs.slide_height = Inches(7.5)
blank_layout = prs.slide_layouts[6]
slide = None
bullet_count = 0
max_bullets = 8