MiroFish/backend/app/utils/file_parser.py

141 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
文件解析工具
支持PDF、Markdown、TXT文件的文本提取
"""
import os
from pathlib import Path
from typing import List, Optional
class FileParser:
"""文件解析器"""
SUPPORTED_EXTENSIONS = {'.pdf', '.md', '.markdown', '.txt'}
@classmethod
def extract_text(cls, file_path: str) -> str:
"""
从文件中提取文本
Args:
file_path: 文件路径
Returns:
提取的文本内容
"""
path = Path(file_path)
if not path.exists():
raise FileNotFoundError(f"文件不存在: {file_path}")
suffix = path.suffix.lower()
if suffix not in cls.SUPPORTED_EXTENSIONS:
raise ValueError(f"不支持的文件格式: {suffix}")
if suffix == '.pdf':
return cls._extract_from_pdf(file_path)
elif suffix in {'.md', '.markdown'}:
return cls._extract_from_md(file_path)
elif suffix == '.txt':
return cls._extract_from_txt(file_path)
raise ValueError(f"无法处理的文件格式: {suffix}")
@staticmethod
def _extract_from_pdf(file_path: str) -> str:
"""从PDF提取文本"""
try:
import fitz # PyMuPDF
except ImportError:
raise ImportError("需要安装PyMuPDF: pip install PyMuPDF")
text_parts = []
with fitz.open(file_path) as doc:
for page in doc:
text = page.get_text()
if text.strip():
text_parts.append(text)
return "\n\n".join(text_parts)
@staticmethod
def _extract_from_md(file_path: str) -> str:
"""从Markdown提取文本"""
with open(file_path, 'r', encoding='utf-8') as f:
return f.read()
@staticmethod
def _extract_from_txt(file_path: str) -> str:
"""从TXT提取文本"""
with open(file_path, 'r', encoding='utf-8') as f:
return f.read()
@classmethod
def extract_from_multiple(cls, file_paths: List[str]) -> str:
"""
从多个文件提取文本并合并
Args:
file_paths: 文件路径列表
Returns:
合并后的文本
"""
all_texts = []
for i, file_path in enumerate(file_paths, 1):
try:
text = cls.extract_text(file_path)
filename = Path(file_path).name
all_texts.append(f"=== 文档 {i}: {filename} ===\n{text}")
except Exception as e:
all_texts.append(f"=== 文档 {i}: {file_path} (提取失败: {str(e)}) ===")
return "\n\n".join(all_texts)
def split_text_into_chunks(
text: str,
chunk_size: int = 500,
overlap: int = 50
) -> List[str]:
"""
将文本分割成小块
Args:
text: 原始文本
chunk_size: 每块的字符数
overlap: 重叠字符数
Returns:
文本块列表
"""
if len(text) <= chunk_size:
return [text] if text.strip() else []
chunks = []
start = 0
while start < len(text):
end = start + chunk_size
# 尝试在句子边界处分割
if end < len(text):
# 查找最近的句子结束符
for sep in ['', '', '', '.\n', '!\n', '?\n', '\n\n', '. ', '! ', '? ']:
last_sep = text[start:end].rfind(sep)
if last_sep != -1 and last_sep > chunk_size * 0.3:
end = start + last_sep + len(sep)
break
chunk = text[start:end].strip()
if chunk:
chunks.append(chunk)
# 下一个块从重叠位置开始
start = end - overlap if end < len(text) else len(text)
return chunks