81 lines
2.3 KiB
Python
81 lines
2.3 KiB
Python
# utils/file_handler.py - 文件读取工具(支持 txt / md / pdf / docx)
|
||
import os
|
||
from typing import List
|
||
|
||
|
||
def read_file_auto(file_path: str) -> str:
    """
    Detect the file type from its extension and return the text content.

    Dispatch is done on the lower-cased extension: ``.pdf`` is handed to
    ``_read_pdf``, ``.docx`` / ``.doc`` to ``_read_docx``, and everything
    else (``.txt``, ``.md``, unknown or missing extensions) is read as UTF-8
    text, with undecodable bytes replaced instead of raising.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text content of the file.

    Raises:
        FileNotFoundError: ``file_path`` does not exist.
        RuntimeError: Any failure while opening or parsing the file. The
            underlying exception is attached as ``__cause__``.
    """
    # Checked outside the try block so a missing file surfaces as
    # FileNotFoundError rather than being wrapped in RuntimeError.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"文件不存在: {file_path}")

    ext = os.path.splitext(file_path)[1].lower()

    try:
        if ext == ".pdf":
            return _read_pdf(file_path)
        # NOTE(review): legacy binary ``.doc`` is routed to python-docx as
        # well; python-docx is documented for ``.docx`` only, so such files
        # will presumably fail here and surface as RuntimeError - confirm.
        if ext in (".docx", ".doc"):
            return _read_docx(file_path)
        # txt / md / any other extension: treat as plain text.
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except Exception as e:
        # Chain explicitly so the root cause stays attached to the wrapper.
        raise RuntimeError(f"读取文件失败 [{file_path}]: {e}") from e
|
||
|
||
|
||
def _read_pdf(file_path: str) -> str:
|
||
"""读取 PDF 文件文本"""
|
||
try:
|
||
from pypdf import PdfReader
|
||
except ImportError:
|
||
raise RuntimeError("读取 PDF 需要安装 pypdf:pip install pypdf")
|
||
|
||
reader = PdfReader(file_path)
|
||
pages = [page.extract_text() or "" for page in reader.pages]
|
||
return "\n".join(pages)
|
||
|
||
|
||
def _read_docx(file_path: str) -> str:
|
||
"""读取 Word 文档文本"""
|
||
try:
|
||
from docx import Document
|
||
except ImportError:
|
||
raise RuntimeError("读取 docx 需要安装 python-docx:pip install python-docx")
|
||
|
||
doc = Document(file_path)
|
||
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
|
||
return "\n".join(paragraphs)
|
||
|
||
|
||
def merge_knowledge_files(file_paths: List[str]) -> str:
    """
    Concatenate several knowledge-base files into one text.

    Each file becomes a section headed by ``--- <basename> ---``. A file
    that cannot be read does not abort the merge: its section is reduced to
    a single header line that carries the error message.

    Args:
        file_paths: Paths of the files to merge, in output order.

    Returns:
        The sections joined by a blank line; an empty string when
        ``file_paths`` is empty.
    """

    def _section(path: str) -> str:
        # Build one section; on failure return a header-only placeholder.
        name = os.path.basename(path)
        try:
            return f"--- {name} ---\n{read_file_auto(path)}"
        except Exception as err:
            return f"--- {name} [读取失败: {err}] ---"

    return "\n\n".join(map(_section, file_paths))