142 lines
3.5 KiB
Python
142 lines
3.5 KiB
Python
# utils/file_handler.py - 文件读取工具(支持 txt/md/pdf/docx)
|
||
import os
|
||
from typing import List, Optional
|
||
from pathlib import Path
|
||
|
||
|
||
def read_text_file(file_path: str) -> str:
    """
    Read the contents of a plain-text file (.txt / .md / .py, etc.).

    The file is decoded as UTF-8 first. A leading UTF-8 byte-order mark is
    stripped so it does not leak into the returned text as U+FEFF. If the
    bytes are not valid UTF-8, the file is re-read as latin-1, which maps
    every byte to a character and therefore never fails to decode.

    Args:
        file_path: Path to the file.

    Returns:
        The decoded text of the file.

    Raises:
        FileNotFoundError: The path does not exist.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")
    try:
        # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM.
        return path.read_text(encoding="utf-8-sig")
    except UnicodeDecodeError:
        # Last-resort fallback: latin-1 accepts any byte sequence.
        return path.read_text(encoding="latin-1")
|
||
|
||
|
||
def read_pdf_file(file_path: str) -> str:
    """
    Extract the text of a PDF file using PyPDF2.

    Each page is passed through ``extract_text()``; pages that yield no
    text are skipped and the remaining page texts are joined with newlines.

    Args:
        file_path: Path to the PDF file.

    Returns:
        The extracted text, one page after another.

    Raises:
        ImportError: PyPDF2 is not installed.
        FileNotFoundError: The path does not exist.
    """
    # The dependency is imported lazily so the module loads without PyPDF2.
    try:
        import PyPDF2
    except ImportError:
        raise ImportError("请安装 PyPDF2: pip install PyPDF2")

    if not Path(file_path).exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")

    with open(file_path, "rb") as stream:
        pdf = PyPDF2.PdfReader(stream)
        # Consume the generator while the file handle is still open.
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(chunk for chunk in page_texts if chunk)
|
||
|
||
|
||
def read_docx_file(file_path: str) -> str:
    """
    Extract the text of a Word (.docx) file using python-docx.

    Paragraphs whose text is empty or whitespace-only are dropped; the
    remaining paragraph texts are joined with newlines.

    Args:
        file_path: Path to the .docx file.

    Returns:
        The extracted text, with the kept paragraphs merged.

    Raises:
        ImportError: python-docx is not installed.
        FileNotFoundError: The path does not exist.
    """
    # The dependency is imported lazily so the module loads without python-docx.
    try:
        from docx import Document
    except ImportError:
        raise ImportError("请安装 python-docx: pip install python-docx")

    if not Path(file_path).exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")

    kept = []
    for paragraph in Document(file_path).paragraphs:
        # Keep the paragraph's original text; strip() is only the emptiness test.
        if paragraph.text.strip():
            kept.append(paragraph.text)
    return "\n".join(kept)
|
||
|
||
|
||
def read_file_auto(file_path: str) -> str:
    """
    Read a file with the reader that matches its extension.

    The extension is compared case-insensitively. Plain-text extensions
    (.txt, .md, .py, .json, .yaml, .yml) go to ``read_text_file``; .pdf
    goes to ``read_pdf_file``; .docx goes to ``read_docx_file``.

    Args:
        file_path: Path to the file.

    Returns:
        The text content of the file.

    Raises:
        ValueError: The extension is not one of the supported types.
    """
    ext = Path(file_path).suffix.lower()

    # Insertion order is kept so the "supported" list in the error is stable.
    readers = dict.fromkeys(
        (".txt", ".md", ".py", ".json", ".yaml", ".yml"),
        read_text_file,
    )
    readers[".pdf"] = read_pdf_file
    readers[".docx"] = read_docx_file

    if ext not in readers:
        raise ValueError(f"不支持的文件类型: {ext},支持: {list(readers.keys())}")
    return readers[ext](file_path)
|
||
|
||
|
||
def merge_knowledge_files(file_paths: List[str]) -> str:
    """
    Merge several knowledge-base files into a single text.

    Each file becomes one section: a header line carrying the file name,
    followed by the file's content. A file that cannot be read does not
    abort the merge; its section carries the path as given and the error
    message instead. Sections are separated by a blank line.

    Args:
        file_paths: Paths of the knowledge-base files.

    Returns:
        The merged text with per-file headers, or an empty string when
        no paths are given.
    """

    def _section(fp: str) -> str:
        # Best effort: any failure is reported inline rather than raised.
        try:
            content = read_file_auto(fp)
            file_name = Path(fp).name
        except Exception as e:
            return f"### 知识库文件: {fp}\n[读取失败: {e}]"
        return f"### 知识库文件: {file_name}\n{content}"

    if file_paths:
        return "\n\n".join(_section(fp) for fp in file_paths)
    return ""