fix(file_parser): handle non-UTF-8 encoded text files with automatic encoding detection
This commit is contained in:
parent
0efd9352a0
commit
390c120fef
4 changed files with 64 additions and 6 deletions
|
|
@ -8,6 +8,56 @@ from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
def _read_text_with_fallback(file_path: str) -> str:
|
||||||
|
"""
|
||||||
|
读取文本文件,UTF-8失败时自动探测编码。
|
||||||
|
|
||||||
|
采用多级回退策略:
|
||||||
|
1. 首先尝试 UTF-8 解码
|
||||||
|
2. 使用 charset_normalizer 检测编码
|
||||||
|
3. 回退到 chardet 检测编码
|
||||||
|
4. 最终使用 UTF-8 + errors='replace' 兜底
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: 文件路径
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
解码后的文本内容
|
||||||
|
"""
|
||||||
|
data = Path(file_path).read_bytes()
|
||||||
|
|
||||||
|
# 首先尝试 UTF-8
|
||||||
|
try:
|
||||||
|
return data.decode('utf-8')
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# 尝试使用 charset_normalizer 检测编码
|
||||||
|
encoding = None
|
||||||
|
try:
|
||||||
|
from charset_normalizer import from_bytes
|
||||||
|
best = from_bytes(data).best()
|
||||||
|
if best and best.encoding:
|
||||||
|
encoding = best.encoding
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# 回退到 chardet
|
||||||
|
if not encoding:
|
||||||
|
try:
|
||||||
|
import chardet
|
||||||
|
result = chardet.detect(data)
|
||||||
|
encoding = result.get('encoding') if result else None
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# 最终兜底:使用 UTF-8 + replace
|
||||||
|
if not encoding:
|
||||||
|
encoding = 'utf-8'
|
||||||
|
|
||||||
|
return data.decode(encoding, errors='replace')
|
||||||
|
|
||||||
|
|
||||||
class FileParser:
|
class FileParser:
|
||||||
"""文件解析器"""
|
"""文件解析器"""
|
||||||
|
|
||||||
|
|
@ -62,15 +112,13 @@ class FileParser:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _extract_from_md(file_path: str) -> str:
|
def _extract_from_md(file_path: str) -> str:
|
||||||
"""从Markdown提取文本"""
|
"""从Markdown提取文本,支持自动编码检测"""
|
||||||
with open(file_path, 'r', encoding='utf-8') as f:
|
return _read_text_with_fallback(file_path)
|
||||||
return f.read()
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _extract_from_txt(file_path: str) -> str:
|
def _extract_from_txt(file_path: str) -> str:
|
||||||
"""从TXT提取文本"""
|
"""从TXT提取文本,支持自动编码检测"""
|
||||||
with open(file_path, 'r', encoding='utf-8') as f:
|
return _read_text_with_fallback(file_path)
|
||||||
return f.read()
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def extract_from_multiple(cls, file_paths: List[str]) -> str:
|
def extract_from_multiple(cls, file_paths: List[str]) -> str:
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,9 @@ dependencies = [
|
||||||
|
|
||||||
# 文件处理
|
# 文件处理
|
||||||
"PyMuPDF>=1.24.0",
|
"PyMuPDF>=1.24.0",
|
||||||
|
# 编码检测(支持非UTF-8编码的文本文件)
|
||||||
|
"charset-normalizer>=3.0.0",
|
||||||
|
"chardet>=5.0.0",
|
||||||
|
|
||||||
# 工具库
|
# 工具库
|
||||||
"python-dotenv>=1.0.0",
|
"python-dotenv>=1.0.0",
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,9 @@ camel-ai==0.2.78
|
||||||
|
|
||||||
# ============= 文件处理 =============
|
# ============= 文件处理 =============
|
||||||
PyMuPDF>=1.24.0
|
PyMuPDF>=1.24.0
|
||||||
|
# 编码检测(支持非UTF-8编码的文本文件)
|
||||||
|
charset-normalizer>=3.0.0
|
||||||
|
chardet>=5.0.0
|
||||||
|
|
||||||
# ============= 工具库 =============
|
# ============= 工具库 =============
|
||||||
# 环境变量加载
|
# 环境变量加载
|
||||||
|
|
|
||||||
|
|
@ -1244,6 +1244,8 @@ source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "camel-ai" },
|
{ name = "camel-ai" },
|
||||||
{ name = "camel-oasis" },
|
{ name = "camel-oasis" },
|
||||||
|
{ name = "chardet" },
|
||||||
|
{ name = "charset-normalizer" },
|
||||||
{ name = "flask" },
|
{ name = "flask" },
|
||||||
{ name = "flask-cors" },
|
{ name = "flask-cors" },
|
||||||
{ name = "openai" },
|
{ name = "openai" },
|
||||||
|
|
@ -1270,6 +1272,8 @@ dev = [
|
||||||
requires-dist = [
|
requires-dist = [
|
||||||
{ name = "camel-ai", specifier = "==0.2.78" },
|
{ name = "camel-ai", specifier = "==0.2.78" },
|
||||||
{ name = "camel-oasis", specifier = "==0.2.5" },
|
{ name = "camel-oasis", specifier = "==0.2.5" },
|
||||||
|
{ name = "chardet", specifier = ">=5.0.0" },
|
||||||
|
{ name = "charset-normalizer", specifier = ">=3.0.0" },
|
||||||
{ name = "flask", specifier = ">=3.0.0" },
|
{ name = "flask", specifier = ">=3.0.0" },
|
||||||
{ name = "flask-cors", specifier = ">=6.0.0" },
|
{ name = "flask-cors", specifier = ">=6.0.0" },
|
||||||
{ name = "openai", specifier = ">=1.0.0" },
|
{ name = "openai", specifier = ">=1.0.0" },
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue