AIDeveloper-PC/requirements_generator/utils/file_handler.py

81 lines
2.3 KiB
Python
Raw Normal View History

2026-03-05 05:38:26 +00:00
# utils/file_handler.py - file reading utilities (supports txt / md / pdf / docx)
2026-03-04 18:09:45 +00:00
import os
2026-03-05 05:38:26 +00:00
from typing import List
2026-03-04 18:09:45 +00:00
2026-03-05 05:38:26 +00:00
def read_file_auto(file_path: str) -> str:
    """
    Detect the file type from its extension and return the text content.

    ``.pdf`` is delegated to ``_read_pdf`` and ``.docx`` / ``.doc`` to
    ``_read_docx``; every other extension (txt, md, ...) is read as UTF-8
    text, with undecodable bytes replaced instead of raising.

    Args:
        file_path: Path of the file to read.
    Returns:
        The text content of the file.
    Raises:
        FileNotFoundError: ``file_path`` does not exist.
        RuntimeError: Reading or parsing failed; the underlying exception
            is chained as ``__cause__``.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"文件不存在: {file_path}")

    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            return _read_pdf(file_path)
        if ext in (".docx", ".doc"):
            # NOTE(review): python-docx is documented for .docx packages;
            # a legacy binary .doc presumably fails inside _read_docx and
            # surfaces below as RuntimeError - confirm whether .doc should
            # stay routed here.
            return _read_docx(file_path)
        # txt / md / any other text format.
        # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" but drops a
        # leading byte-order mark, so the result never starts with U+FEFF.
        with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
            return f.read()
    except Exception as e:
        # Single failure type for callers; keep the root cause attached.
        raise RuntimeError(f"读取文件失败 [{file_path}]: {e}") from e
def _read_pdf(file_path: str) -> str:
    """
    Read the text of a PDF file.

    Args:
        file_path: Path of the PDF file.
    Returns:
        The extracted text of all pages joined with newlines; a page for
        which ``extract_text()`` returns a falsy value contributes an
        empty string.
    Raises:
        RuntimeError: The ``pypdf`` package cannot be imported.
    """
    try:
        # Imported lazily so the module loads without pypdf installed.
        from pypdf import PdfReader
    except ImportError as exc:
        raise RuntimeError("读取 PDF 需要安装 pypdf: pip install pypdf") from exc

    reader = PdfReader(file_path)
    pages = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(pages)
2026-03-04 18:09:45 +00:00
2026-03-05 05:38:26 +00:00
def _read_docx(file_path: str) -> str:
    """
    Read the text of a Word document.

    Args:
        file_path: Path of the document.
    Returns:
        The text of every paragraph in ``doc.paragraphs`` whose text is not
        blank, joined with newlines.
    Raises:
        RuntimeError: The ``docx`` (python-docx) package cannot be imported.
    """
    try:
        # Imported lazily so the module loads without python-docx installed.
        from docx import Document
    except ImportError as exc:
        raise RuntimeError(
            "读取 docx 需要安装 python-docx: pip install python-docx"
        ) from exc

    doc = Document(file_path)
    # Skip paragraphs that contain only whitespace.
    paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
    return "\n".join(paragraphs)
2026-03-04 18:09:45 +00:00
def merge_knowledge_files(file_paths: List[str]) -> str:
    """
    Merge several knowledge-base files into one text.

    Each file is read with ``read_file_auto``. A file that fails to read
    does not abort the merge: its section is replaced by a header line
    carrying the error message.

    Args:
        file_paths: List of file paths.
    Returns:
        The merged text; each file's section starts with a
        ``--- <basename> ---`` header line and sections are separated by a
        blank line.
    """
    sections = []
    for file_path in file_paths:
        try:
            text = read_file_auto(file_path)
        except Exception as err:
            # Best effort: record the failure in place of the content.
            section = f"--- {os.path.basename(file_path)} [读取失败: {err}] ---"
        else:
            section = f"--- {os.path.basename(file_path)} ---\n{text}"
        sections.append(section)
    return "\n\n".join(sections)