2026-03-05 05:38:26 +00:00
|
|
|
|
# utils/file_handler.py - file reading utilities (supports txt / md / pdf / docx)
|
2026-03-04 18:09:45 +00:00
|
|
|
|
import os
|
2026-03-05 05:38:26 +00:00
|
|
|
|
from typing import List
|
2026-03-04 18:09:45 +00:00
|
|
|
|
|
|
|
|
|
|
|
2026-03-05 05:38:26 +00:00
|
|
|
|
def read_file_auto(file_path: str) -> str:
    """Read a file's text content, picking a reader from its extension.

    ``.pdf`` files are handed to ``_read_pdf`` and ``.docx`` / ``.doc`` files
    to ``_read_docx``. Every other extension (``.txt``, ``.md``, ...) is read
    as UTF-8 text; undecodable bytes are replaced instead of raising.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text content of the file.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: If reading fails for any other reason. The underlying
            exception is attached as ``__cause__``.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"文件不存在: {file_path}")

    # Lower-case the suffix so ".PDF" and ".pdf" take the same branch.
    ext = os.path.splitext(file_path)[1].lower()

    try:
        if ext == ".pdf":
            return _read_pdf(file_path)
        # NOTE(review): legacy ".doc" is routed to the docx reader as well;
        # confirm that reader can actually open that format.
        if ext in (".docx", ".doc"):
            return _read_docx(file_path)
        # txt / md / any other extension: treat as UTF-8 text.
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except Exception as e:
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise RuntimeError(f"读取文件失败 [{file_path}]: {e}") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _read_pdf(file_path: str) -> str:
    """Extract the text of a PDF file.

    ``pypdf`` is imported lazily so the module can be loaded without it.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The extracted text of every page, joined with newlines. A page whose
        extraction yields a falsy value contributes an empty string.

    Raises:
        RuntimeError: If ``pypdf`` is not installed.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        raise RuntimeError("读取 PDF 需要安装 pypdf:pip install pypdf")

    document = PdfReader(file_path)
    # `or ""` keeps the join safe when a page yields no text.
    return "\n".join(page.extract_text() or "" for page in document.pages)
|
2026-03-04 18:09:45 +00:00
|
|
|
|
|
|
|
|
|
|
|
2026-03-05 05:38:26 +00:00
|
|
|
|
def _read_docx(file_path: str) -> str:
    """Extract the paragraph text of a Word document.

    ``python-docx`` is imported lazily so the module can be loaded without it.

    Args:
        file_path: Path of the Word document.

    Returns:
        The text of every paragraph that is not blank, joined with newlines.

    Raises:
        RuntimeError: If ``python-docx`` is not installed.
    """
    try:
        from docx import Document
    except ImportError:
        raise RuntimeError("读取 docx 需要安装 python-docx:pip install python-docx")

    document = Document(file_path)
    # Skip paragraphs that are empty or whitespace-only.
    non_blank = (para.text for para in document.paragraphs if para.text.strip())
    return "\n".join(non_blank)
|
2026-03-04 18:09:45 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def merge_knowledge_files(file_paths: List[str]) -> str:
    """Merge several knowledge-base files into a single text.

    Each file is read with ``read_file_auto`` and placed under a
    ``--- <basename> ---`` header. A file that cannot be read does not abort
    the merge: it is represented by a header line carrying the error message.

    Args:
        file_paths: Paths of the files to merge, in output order.

    Returns:
        The sections joined by blank lines; an empty string when
        ``file_paths`` is empty.
    """
    sections = []
    for path in file_paths:
        try:
            body = read_file_auto(path)
            section = f"--- {os.path.basename(path)} ---\n{body}"
        except Exception as exc:
            # Best effort: record the failure and carry on with the rest.
            section = f"--- {os.path.basename(path)} [读取失败: {exc}] ---"
        sections.append(section)
    return "\n\n".join(sections)
|