AIDeveloper-PC/requirements_generator/utils/file_handler.py

142 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# utils/file_handler.py - File reading utilities (supports txt/md/pdf/docx)
import os
from typing import List, Optional
from pathlib import Path
def read_text_file(file_path: str) -> str:
    """Read a plain-text file (.txt / .md / .py, etc.) and return its content.

    The file is decoded as UTF-8 first, and a leading UTF-8 byte-order mark
    is dropped if present. If the bytes are not valid UTF-8, the file is
    decoded as latin-1 instead; latin-1 maps every byte to a character, so
    this fallback never raises a decoding error.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text content of the file.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")
    try:
        # "utf-8-sig" decodes ordinary UTF-8 unchanged but strips a leading
        # BOM, so "\ufeff" does not leak into the returned text.
        return path.read_text(encoding="utf-8-sig")
    except UnicodeDecodeError:
        # Best-effort fallback for legacy single-byte encodings.
        return path.read_text(encoding="latin-1")
def read_pdf_file(file_path: str) -> str:
    """Extract the text of a PDF file, page by page.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The text of every page that yielded non-empty text, joined with
        newlines.

    Raises:
        ImportError: If PyPDF2 is not installed.
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Imported inside the function, so a missing PyPDF2 only surfaces when a
    # PDF is actually read.
    try:
        import PyPDF2
    except ImportError:
        raise ImportError("请安装 PyPDF2: pip install PyPDF2")

    if not Path(file_path).exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")

    with open(file_path, "rb") as stream:
        # Extraction must finish while the file handle is still open.
        page_texts = (
            page.extract_text() for page in PyPDF2.PdfReader(stream).pages
        )
        return "\n".join(chunk for chunk in page_texts if chunk)
def read_docx_file(file_path: str) -> str:
    """Extract the text of a Word (.docx) document.

    Args:
        file_path: Path of the .docx file.

    Returns:
        The text of all paragraphs that are not blank (whitespace-only
        paragraphs are skipped), joined with newlines.

    Raises:
        ImportError: If python-docx is not installed.
        FileNotFoundError: If ``file_path`` does not exist.
    """
    # Imported inside the function, so a missing python-docx only surfaces
    # when a .docx file is actually read.
    try:
        from docx import Document
    except ImportError:
        raise ImportError("请安装 python-docx: pip install python-docx")

    if not Path(file_path).exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")

    kept = []
    for paragraph in Document(file_path).paragraphs:
        content = paragraph.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept)
def read_file_auto(file_path: str) -> str:
    """Read a file with the reader that matches its extension.

    The extension is compared case-insensitively. Plain-text extensions
    (.txt, .md, .py, .json, .yaml, .yml) go to ``read_text_file``, ``.pdf``
    goes to ``read_pdf_file`` and ``.docx`` goes to ``read_docx_file``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text content produced by the selected reader.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    suffix = Path(file_path).suffix.lower()

    # Insertion order matters: it is echoed in the error message below.
    dispatch = dict.fromkeys(
        (".txt", ".md", ".py", ".json", ".yaml", ".yml"),
        read_text_file,
    )
    dispatch[".pdf"] = read_pdf_file
    dispatch[".docx"] = read_docx_file

    if suffix not in dispatch:
        raise ValueError(f"不支持的文件类型: {suffix},支持: {list(dispatch)}")
    return dispatch[suffix](file_path)
def merge_knowledge_files(file_paths: List[str]) -> str:
    """Merge several knowledge-base files into one text.

    Each file becomes a section that starts with a header line naming the
    file. A file that cannot be read does not abort the merge: its section
    carries the path as given and the error message instead of content.

    Args:
        file_paths: Paths of the knowledge-base files to merge.

    Returns:
        All sections joined by a blank line, or an empty string when
        ``file_paths`` is empty.
    """

    def _render_section(source: str) -> str:
        # Best-effort: any read failure is reported inline in the section.
        try:
            body = read_file_auto(source)
            return f"### 知识库文件: {Path(source).name}\n{body}"
        except Exception as err:
            return f"### 知识库文件: {source}\n[读取失败: {err}]"

    if not file_paths:
        return ""
    return "\n\n".join([_render_section(fp) for fp in file_paths])