From 390c120fef9a7161efa1d5cc006f239e364583cd Mon Sep 17 00:00:00 2001
From: 666ghj <670939375@qq.com>
Date: Thu, 22 Jan 2026 18:28:37 +0800
Subject: [PATCH] fix(file_parser): handle non-UTF-8 encoded text files with
 automatic encoding detection

---
 backend/app/utils/file_parser.py | 60 ++++++++++++++++++++++++++++----
 backend/pyproject.toml           |  3 ++
 backend/requirements.txt         |  3 ++
 backend/uv.lock                  |  4 +++
 4 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/backend/app/utils/file_parser.py b/backend/app/utils/file_parser.py
index ac0f636..3f1d8ed 100644
--- a/backend/app/utils/file_parser.py
+++ b/backend/app/utils/file_parser.py
@@ -8,6 +8,56 @@ from pathlib import Path
 from typing import List, Optional
 
 
+def _read_text_with_fallback(file_path: str) -> str:
+    """
+    Read a text file, detecting the encoding when strict UTF-8 decoding fails.
+    
+    Fallback order (steps 2-4 decode with errors='replace'):
+    1. Strict UTF-8; on success the text is returned immediately
+    2. The encoding reported by charset_normalizer
+    3. The encoding reported by chardet
+    4. UTF-8 as the last-resort default
+    
+    Args:
+        file_path: Path of the file to read
+        
+    Returns:
+        The decoded text content
+    """
+    data = Path(file_path).read_bytes()
+    
+    # Try strict UTF-8 first; a decode error moves on to detection
+    try:
+        return data.decode('utf-8')
+    except UnicodeDecodeError:
+        pass
+    
+    # Ask charset_normalizer for an encoding; any failure (even a failed import) is ignored
+    encoding = None
+    try:
+        from charset_normalizer import from_bytes
+        best = from_bytes(data).best()
+        if best and best.encoding:
+            encoding = best.encoding
+    except Exception:
+        pass
+    
+    # Fall back to chardet when no encoding was found; failures are ignored here too
+    if not encoding:
+        try:
+            import chardet
+            result = chardet.detect(data)
+            encoding = result.get('encoding') if result else None
+        except Exception:
+            pass
+    
+    # Last resort: default to UTF-8 when neither detector produced an encoding
+    if not encoding:
+        encoding = 'utf-8'
+    
+    return data.decode(encoding, errors='replace')  # NOTE(review): LookupError if the codec name is unknown -- confirm
+
+
 class FileParser:
     """文件解析器"""
     
@@ -62,15 +112,13 @@ class FileParser:
     
     @staticmethod
     def _extract_from_md(file_path: str) -> str:
-        """从Markdown提取文本"""
-        with open(file_path, 'r', encoding='utf-8') as f:
-            return f.read()
+        """Extract text from a Markdown file via _read_text_with_fallback (automatic encoding detection)."""
+        return _read_text_with_fallback(file_path)
     
     @staticmethod
     def _extract_from_txt(file_path: str) -> str:
-        """从TXT提取文本"""
-        with open(file_path, 'r', encoding='utf-8') as f:
-            return f.read()
+        """Extract text from a TXT file via _read_text_with_fallback (automatic encoding detection)."""
+        return _read_text_with_fallback(file_path)
     
     @classmethod
     def extract_from_multiple(cls, file_paths: List[str]) -> str:
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index 8f665c0..4f5361d 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -25,6 +25,9 @@ dependencies = [
     
     # 文件处理
     "PyMuPDF>=1.24.0",
+    # 编码检测（支持非UTF-8编码的文本文件）
+    "charset-normalizer>=3.0.0",
+    "chardet>=5.0.0",
     
     # 工具库
     "python-dotenv>=1.0.0",
diff --git a/backend/requirements.txt b/backend/requirements.txt
index cfe68e0..4f14629 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -23,6 +23,9 @@ camel-ai==0.2.78
 
 # ============= 文件处理 =============
 PyMuPDF>=1.24.0
+# 编码检测（支持非UTF-8编码的文本文件）
+charset-normalizer>=3.0.0
+chardet>=5.0.0
 
 # ============= 工具库 =============
 # 环境变量加载
diff --git a/backend/uv.lock b/backend/uv.lock
index 0b71570..f1ce4b6 100644
--- a/backend/uv.lock
+++ b/backend/uv.lock
@@ -1244,6 +1244,8 @@ source = { editable = "." }
 dependencies = [
     { name = "camel-ai" },
     { name = "camel-oasis" },
+    { name = "chardet" },
+    { name = "charset-normalizer" },
     { name = "flask" },
     { name = "flask-cors" },
     { name = "openai" },
@@ -1270,6 +1272,8 @@ dev = [
 requires-dist = [
     { name = "camel-ai", specifier = "==0.2.78" },
     { name = "camel-oasis", specifier = "==0.2.5" },
+    { name = "chardet", specifier = ">=5.0.0" },
+    { name = "charset-normalizer", specifier = ">=3.0.0" },
     { name = "flask", specifier = ">=3.0.0" },
     { name = "flask-cors", specifier = ">=6.0.0" },
     { name = "openai", specifier = ">=1.0.0" },