142 lines
3.5 KiB
Python
142 lines
3.5 KiB
Python
|
|
# utils/file_handler.py - file reading utilities (supports txt/md/py/json/yaml/yml/pdf/docx)
|
|||
|
|
import os
|
|||
|
|
from typing import List, Optional
|
|||
|
|
from pathlib import Path
|
|||
|
|
|
|||
|
|
|
|||
|
|
def read_text_file(file_path: str) -> str:
    """
    Read the contents of a plain-text file (.txt / .md / .py, etc.).

    The file is decoded as UTF-8 first; a leading UTF-8 byte-order mark, if
    present, is stripped so it does not leak into the returned text. If the
    bytes are not valid UTF-8, the file is re-read as latin-1. latin-1 maps
    every byte value to a character, so that fallback never fails to decode
    (content in another legacy encoding comes back garbled instead of raising).

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text content of the file.

    Raises:
        FileNotFoundError: If the path does not exist.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")
    try:
        # "utf-8-sig" reads ordinary UTF-8 and additionally drops a leading
        # BOM, which plain "utf-8" would return as a stray "\ufeff" character.
        return path.read_text(encoding="utf-8-sig")
    except UnicodeDecodeError:
        # Last-resort fallback: every byte is a valid latin-1 character.
        return path.read_text(encoding="latin-1")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def read_pdf_file(file_path: str) -> str:
    """
    Extract the text of a PDF file.

    PyPDF2 is imported lazily inside the function, and that import is
    attempted before the path is checked for existence.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The text extracted from each page, joined with newlines. Pages for
        which no text is extracted are skipped.

    Raises:
        ImportError: If PyPDF2 cannot be imported.
        FileNotFoundError: If the path does not exist.
    """
    try:
        import PyPDF2
    except ImportError:
        raise ImportError("请安装 PyPDF2: pip install PyPDF2")

    if not Path(file_path).exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")

    with open(file_path, "rb") as pdf_stream:
        extracted = (
            page.extract_text() for page in PyPDF2.PdfReader(pdf_stream).pages
        )
        # Drop pages whose extraction result is empty or None.
        page_texts = [chunk for chunk in extracted if chunk]
    return "\n".join(page_texts)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def read_docx_file(file_path: str) -> str:
    """
    Extract the text of a Word (.docx) file.

    python-docx is imported lazily inside the function, and that import is
    attempted before the path is checked for existence.

    Args:
        file_path: Path of the .docx file.

    Returns:
        The text of the document's paragraphs joined with newlines.
        Paragraphs that are empty or whitespace-only are left out.

    Raises:
        ImportError: If python-docx cannot be imported.
        FileNotFoundError: If the path does not exist.
    """
    try:
        from docx import Document
    except ImportError:
        raise ImportError("请安装 python-docx: pip install python-docx")

    if not Path(file_path).exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")

    kept = []
    for paragraph in Document(file_path).paragraphs:
        # Keep the paragraph text as-is; strip() is only used for the emptiness test.
        if paragraph.text.strip():
            kept.append(paragraph.text)
    return "\n".join(kept)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def read_file_auto(file_path: str) -> str:
    """
    Read a file with the reader that matches its extension.

    The extension is compared case-insensitively. Plain-text extensions
    (.txt, .md, .py, .json, .yaml, .yml) go to read_text_file, .pdf goes to
    read_pdf_file and .docx goes to read_docx_file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text content produced by the selected reader.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    suffix = Path(file_path).suffix.lower()

    # Insertion order matters: it is the order shown in the error message.
    dispatch = dict.fromkeys(
        (".txt", ".md", ".py", ".json", ".yaml", ".yml"), read_text_file
    )
    dispatch[".pdf"] = read_pdf_file
    dispatch[".docx"] = read_docx_file

    if suffix not in dispatch:
        raise ValueError(f"不支持的文件类型: {suffix},支持: {list(dispatch)}")
    return dispatch[suffix](file_path)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def merge_knowledge_files(file_paths: List[str]) -> str:
    """
    Merge several knowledge-base files into a single text.

    Each file becomes one section that starts with a "### 知识库文件: ..."
    header line. Reading is best-effort: when a file cannot be read, its
    section carries the path as given plus a failure marker with the error
    message, and the remaining files are still processed.

    Args:
        file_paths: Paths of the knowledge-base files.

    Returns:
        The sections joined by blank lines, or an empty string when
        file_paths is empty.
    """
    if not file_paths:
        return ""

    merged = []
    for source in file_paths:
        try:
            body = read_file_auto(source)
            # Successful sections are labelled with the bare file name.
            entry = f"### 知识库文件: {Path(source).name}\n{body}"
        except Exception as err:
            # Failed sections keep the path exactly as it was passed in.
            entry = f"### 知识库文件: {source}\n[读取失败: {err}]"
        merged.append(entry)

    return "\n\n".join(merged)
|