AIDeveloper-PC/requirements_generator/utils/file_handler.py

142 lines
3.5 KiB
Python
Raw Normal View History

2026-03-04 18:09:45 +00:00
# utils/file_handler.py - File reading utilities (supports txt/md/pdf/docx)
import os
from typing import List, Optional
from pathlib import Path
def read_text_file(file_path: str) -> str:
    """
    Read the contents of a plain-text file (.txt / .md / .py).

    The file is decoded as UTF-8 first. A leading byte-order mark, if
    present, is dropped so it does not leak into the returned text. When the
    bytes are not valid UTF-8 the file is read again as latin-1, which maps
    every byte to a character, so a decoding error is never propagated.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text content of the file.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")
    try:
        # "utf-8-sig" decodes exactly like "utf-8" but strips a leading BOM,
        # which plain "utf-8" would return as a stray "\ufeff" character.
        return path.read_text(encoding="utf-8-sig")
    except UnicodeDecodeError:
        # Last-resort fallback: latin-1 accepts any byte sequence.
        return path.read_text(encoding="latin-1")
def read_pdf_file(file_path: str) -> str:
    """
    Extract the text content of a PDF file using PyPDF2.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The text of every page that yields non-empty text, joined with
        newline characters.

    Raises:
        ImportError: If PyPDF2 is not installed.
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # PyPDF2 is imported inside the function; a missing package is reported
    # with an install hint before the path is checked.
    try:
        import PyPDF2
    except ImportError:
        raise ImportError("请安装 PyPDF2: pip install PyPDF2")

    if not Path(file_path).exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")

    with open(file_path, "rb") as pdf_stream:
        pages = PyPDF2.PdfReader(pdf_stream).pages
        # Extraction must happen while the underlying stream is still open.
        page_texts = [page.extract_text() for page in pages]

    # Pages whose extraction result is empty or None are skipped.
    return "\n".join(chunk for chunk in page_texts if chunk)
def read_docx_file(file_path: str) -> str:
    """
    Extract the text content of a Word (.docx) file using python-docx.

    Args:
        file_path: Path of the .docx file.

    Returns:
        The text of all paragraphs that are not blank, joined with newline
        characters.

    Raises:
        ImportError: If python-docx is not installed.
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # python-docx is imported inside the function; a missing package is
    # reported with an install hint before the path is checked.
    try:
        from docx import Document
    except ImportError:
        raise ImportError("请安装 python-docx: pip install python-docx")

    if not Path(file_path).exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")

    paragraphs = Document(file_path).paragraphs
    # Paragraphs containing only whitespace are dropped.
    non_blank = [item.text for item in paragraphs if item.text.strip()]
    return "\n".join(non_blank)
def read_file_auto(file_path: str) -> str:
    """
    Read a file with the reader that matches its extension.

    The extension is compared case-insensitively. Text-like extensions
    (.txt, .md, .py, .json, .yaml, .yml) go to ``read_text_file``; .pdf goes
    to ``read_pdf_file``; .docx goes to ``read_docx_file``. Exceptions raised
    by the selected reader are not caught here.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text content of the file.

    Raises:
        ValueError: If the extension has no registered reader.
    """
    suffix = Path(file_path).suffix.lower()

    # Insertion order matters: it is the order shown in the error message.
    dispatch = dict.fromkeys(
        (".txt", ".md", ".py", ".json", ".yaml", ".yml"), read_text_file
    )
    dispatch[".pdf"] = read_pdf_file
    dispatch[".docx"] = read_docx_file

    handler = dispatch.get(suffix)
    if handler is not None:
        return handler(file_path)
    raise ValueError(f"不支持的文件类型: {suffix},支持: {list(dispatch)}")
def merge_knowledge_files(file_paths: List[str]) -> str:
    """
    Merge several knowledge-base files into a single text.

    Each file becomes one section consisting of a header line with the file
    name followed by the file content. A file that cannot be read does not
    abort the merge: its section carries the path as given and the error
    message instead of content.

    Args:
        file_paths: Paths of the knowledge-base files to merge.

    Returns:
        The sections joined by blank lines, or an empty string when
        ``file_paths`` is empty.
    """
    if not file_paths:
        return ""

    merged = []
    for source in file_paths:
        try:
            body = read_file_auto(source)
            entry = f"### 知识库文件: {Path(source).name}\n{body}"
        except Exception as err:
            # Best-effort: record the failure in place and keep going.
            entry = f"### 知识库文件: {source}\n[读取失败: {err}]"
        merged.append(entry)

    return "\n\n".join(merged)