From 390c120fef9a7161efa1d5cc006f239e364583cd Mon Sep 17 00:00:00 2001
From: 666ghj <670939375@qq.com>
Date: Thu, 22 Jan 2026 18:28:37 +0800
Subject: [PATCH] fix(file_parser): handle non-UTF-8 encoded text files with automatic encoding detection

---
 backend/app/utils/file_parser.py | 69 +++++++++++++++++++++++++++++---
 backend/pyproject.toml           |  3 ++
 backend/requirements.txt         |  3 ++
 backend/uv.lock                  |  4 +++
 4 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/backend/app/utils/file_parser.py b/backend/app/utils/file_parser.py
index ac0f636..3f1d8ed 100644
--- a/backend/app/utils/file_parser.py
+++ b/backend/app/utils/file_parser.py
@@ -8,6 +8,65 @@ from pathlib import Path
 from typing import List, Optional
 
 
+def _read_text_with_fallback(file_path: str) -> str:
+    """
+    Read a text file, detecting the encoding when it is not valid UTF-8.
+
+    Decoding falls back through several stages:
+    1. Strict UTF-8.
+    2. The encoding guessed by charset_normalizer, if it is available.
+    3. The encoding guessed by chardet, if it is available.
+    4. UTF-8 with errors='replace' as a last resort.
+
+    Line endings are normalized to LF, matching what the text-mode open()
+    call this helper replaces returned.
+
+    Args:
+        file_path: Path of the file to read.
+
+    Returns:
+        The decoded text content.
+    """
+    data = Path(file_path).read_bytes()
+
+    # Fast path: strict UTF-8.
+    try:
+        text = data.decode('utf-8')
+    except UnicodeDecodeError:
+        text = None
+
+    if text is None:
+        encoding = None
+
+        # Detection is best-effort: a missing or failing detector must not
+        # prevent the file from being read, so errors are ignored on purpose.
+        try:
+            from charset_normalizer import from_bytes
+            best = from_bytes(data).best()
+            if best is not None and best.encoding:
+                encoding = best.encoding
+        except Exception:
+            pass
+
+        # Fall back to chardet when charset_normalizer gave no answer.
+        if not encoding:
+            try:
+                import chardet
+                result = chardet.detect(data)
+                encoding = result.get('encoding') if result else None
+            except Exception:
+                pass
+
+        try:
+            text = data.decode(encoding or 'utf-8', errors='replace')
+        except (LookupError, ValueError):
+            # The detector reported a codec name Python cannot use.
+            text = data.decode('utf-8', errors='replace')
+
+    # Reproduce the universal-newline translation of text-mode open().
+    return text.replace('\r\n', '\n').replace('\r', '\n')
+
+
 class FileParser:
     """文件解析器"""
 
@@ -62,15 +121,13 @@ class FileParser:
 
     @staticmethod
     def _extract_from_md(file_path: str) -> str:
-        """从Markdown提取文本"""
-        with open(file_path, 'r', encoding='utf-8') as f:
-            return f.read()
+        """Extract Markdown text with automatic encoding detection."""
+        return _read_text_with_fallback(file_path)
 
     @staticmethod
     def _extract_from_txt(file_path: str) -> str:
-        """从TXT提取文本"""
-        with open(file_path, 'r', encoding='utf-8') as f:
-            return f.read()
+        """Extract TXT text with automatic encoding detection."""
+        return _read_text_with_fallback(file_path)
 
     @classmethod
     def extract_from_multiple(cls, file_paths: List[str]) -> str:
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index 8f665c0..4f5361d 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -25,6 +25,9 @@ dependencies = [
 
     # 文件处理
     "PyMuPDF>=1.24.0",
+    # Encoding detection (supports text files that are not UTF-8)
+    "charset-normalizer>=3.0.0",
+    "chardet>=5.0.0",
 
     # 工具库
     "python-dotenv>=1.0.0",
diff --git a/backend/requirements.txt b/backend/requirements.txt
index cfe68e0..4f14629 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -23,6 +23,9 @@ camel-ai==0.2.78
 
 # ============= 文件处理 =============
 PyMuPDF>=1.24.0
+# Encoding detection (supports text files that are not UTF-8)
+charset-normalizer>=3.0.0
+chardet>=5.0.0
 
 # ============= 工具库 =============
 # 环境变量加载
diff --git a/backend/uv.lock b/backend/uv.lock
index 0b71570..f1ce4b6 100644
--- a/backend/uv.lock
+++ b/backend/uv.lock
@@ -1244,6 +1244,8 @@ source = { editable = "." }
 dependencies = [
     { name = "camel-ai" },
     { name = "camel-oasis" },
+    { name = "chardet" },
+    { name = "charset-normalizer" },
     { name = "flask" },
     { name = "flask-cors" },
     { name = "openai" },
@@ -1270,6 +1272,8 @@ dev = [
 requires-dist = [
     { name = "camel-ai", specifier = "==0.2.78" },
     { name = "camel-oasis", specifier = "==0.2.5" },
+    { name = "chardet", specifier = ">=5.0.0" },
+    { name = "charset-normalizer", specifier = ">=3.0.0" },
     { name = "flask", specifier = ">=3.0.0" },
     { name = "flask-cors", specifier = ">=6.0.0" },
     { name = "openai", specifier = ">=1.0.0" },