AIDeveloper-PC/requirements_generator/utils/file_handler.py

81 lines
2.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# utils/file_handler.py - file reading utilities (supports txt / md / pdf / docx)
import os
from typing import List
def read_file_auto(file_path: str) -> str:
    """Read a file's text content, choosing the reader from its extension.

    Dispatch is done on the lower-cased extension:

    * ``.pdf`` is handed to ``_read_pdf``.
    * ``.docx`` / ``.doc`` are handed to ``_read_docx``.
    * Anything else (``.txt``, ``.md``, ...) is decoded as UTF-8, with
      undecodable bytes replaced instead of raising.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text content of the file.

    Raises:
        FileNotFoundError: ``file_path`` does not exist.
        RuntimeError: Reading failed for any other reason; the underlying
            exception is attached as ``__cause__``.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"文件不存在: {file_path}")

    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            return _read_pdf(file_path)
        if ext in (".docx", ".doc"):
            # NOTE(review): legacy binary .doc is routed to the same reader
            # as .docx - confirm the reader can actually open that format.
            return _read_docx(file_path)
        # txt / md / any other text format
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    except Exception as e:
        # Chain explicitly so the original failure stays reachable via
        # __cause__ instead of only appearing in the message text.
        raise RuntimeError(f"读取文件失败 [{file_path}]: {e}") from e
def _read_pdf(file_path: str) -> str:
    """Extract the text of a PDF file, page by page.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The text of every page joined with newlines. A page for which
        ``extract_text()`` returns a falsy value contributes an empty string.

    Raises:
        RuntimeError: ``pypdf`` cannot be imported.
    """
    # Imported inside the function so that importing this module does not
    # itself require pypdf.
    try:
        from pypdf import PdfReader
    except ImportError as exc:
        raise RuntimeError("读取 PDF 需要安装 pypdf: pip install pypdf") from exc

    reader = PdfReader(file_path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)
def _read_docx(file_path: str) -> str:
    """Extract the text of a Word document.

    Only ``doc.paragraphs`` is read; paragraphs whose text is empty or
    whitespace-only are skipped.

    Args:
        file_path: Path of the Word document.

    Returns:
        The remaining paragraph texts joined with newlines.

    Raises:
        RuntimeError: ``python-docx`` (module ``docx``) cannot be imported.
    """
    # Imported inside the function so that importing this module does not
    # itself require python-docx.
    try:
        from docx import Document
    except ImportError as exc:
        raise RuntimeError(
            "读取 docx 需要安装 python-docx: pip install python-docx"
        ) from exc

    doc = Document(file_path)
    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
def merge_knowledge_files(file_paths: List[str]) -> str:
    """Merge several knowledge-base files into a single text.

    Each file is read with ``read_file_auto`` and emitted under a
    ``--- <basename> ---`` header line. A file that cannot be read does not
    abort the merge: it is represented by a header line that carries the
    error message instead of any content.

    Args:
        file_paths: Paths of the files to merge, in output order.

    Returns:
        The per-file sections separated by a blank line; an empty string
        when ``file_paths`` is empty.
    """
    def _section(path: str) -> str:
        # Best-effort by design: any read error becomes a marker section.
        try:
            body = read_file_auto(path)
            return f"--- {os.path.basename(path)} ---\n{body}"
        except Exception as err:
            return f"--- {os.path.basename(path)} [读取失败: {err}] ---"

    sections = [_section(path) for path in file_paths]
    return "\n\n".join(sections)