# utils/file_handler.py - File reading utilities (supports txt / md / pdf / docx)

import os
from typing import List


def read_file_auto(file_path: str) -> str:
    """Read the text content of a file, choosing a reader by file extension.

    ``.pdf`` files are delegated to ``_read_pdf``; ``.docx`` / ``.doc`` files
    are delegated to ``_read_docx``; every other extension (``.txt``, ``.md``,
    ...) is read as UTF-8 text. Undecodable bytes are replaced rather than
    raising, and a leading UTF-8 byte-order mark is stripped.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text content of the file.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: If reading or parsing fails for any reason; the
            underlying exception is attached as ``__cause__``.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"文件不存在: {file_path}")

    ext = os.path.splitext(file_path)[1].lower()

    try:
        if ext == ".pdf":
            return _read_pdf(file_path)
        if ext in (".docx", ".doc"):
            # NOTE(review): python-docx is documented for .docx packages;
            # legacy binary .doc files will presumably fail inside the reader
            # and surface as the RuntimeError below - confirm.
            return _read_docx(file_path)
        # txt / md / any other text format.
        # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM,
        # so files saved by BOM-writing editors do not start with "\ufeff".
        with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
            return f.read()
    except Exception as exc:
        # Single failure type for callers; keep the original as the cause.
        raise RuntimeError(f"读取文件失败 [{file_path}]: {exc}") from exc


def _read_pdf(file_path: str) -> str:
    """Extract the text of every page of a PDF, joined with newlines.

    ``pypdf`` is imported lazily so the module can be used without it as long
    as no PDF is read. Pages for which ``extract_text()`` returns a falsy
    value contribute an empty string.

    Raises:
        RuntimeError: If ``pypdf`` is not installed.
    """
    try:
        from pypdf import PdfReader
    except ImportError as exc:
        raise RuntimeError("读取 PDF 需要安装 pypdf:pip install pypdf") from exc

    reader = PdfReader(file_path)
    pages = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(pages)


def _read_docx(file_path: str) -> str:
    """Extract the non-blank paragraphs of a Word document, joined with newlines.

    ``python-docx`` is imported lazily so the module can be used without it as
    long as no Word document is read. Only ``doc.paragraphs`` is consulted;
    paragraphs whose text is empty or whitespace-only are skipped.

    Raises:
        RuntimeError: If ``python-docx`` is not installed.
    """
    try:
        from docx import Document
    except ImportError as exc:
        raise RuntimeError(
            "读取 docx 需要安装 python-docx:pip install python-docx"
        ) from exc

    doc = Document(file_path)
    paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
    return "\n".join(paragraphs)


def merge_knowledge_files(file_paths: List[str]) -> str:
    """Merge several knowledge-base files into a single text.

    Each file is read with ``read_file_auto`` and prefixed with a
    ``--- <basename> ---`` separator line. Reading is best-effort: a file that
    cannot be read does not abort the merge; instead a separator line carrying
    the error message is emitted in its place.

    Args:
        file_paths: Paths of the files to merge, in output order.

    Returns:
        The merged text, with sections separated by a blank line. An empty
        input list yields an empty string.
    """
    parts = []
    for path in file_paths:
        name = os.path.basename(path)
        try:
            content = read_file_auto(path)
        except Exception as e:
            # Deliberate best-effort: record the failure inline and continue.
            parts.append(f"--- {name} [读取失败: {e}] ---")
        else:
            parts.append(f"--- {name} ---\n{content}")
    return "\n\n".join(parts)