MiroFish/backend/app/services/simulation_runner.py
666ghj 5b4f02f421 Enhance simulation configuration and management features
- Added support for a `max_rounds` parameter in simulation API, allowing users to limit the number of simulation rounds, improving control over simulation duration.
- Updated README.md to reflect the new `max_rounds` parameter and its usage in simulation requests.
- Enhanced error handling for `max_rounds` input validation to ensure it is a positive integer.
- Modified simulation runner and related scripts to incorporate `max_rounds` functionality, ensuring consistent application across Twitter and Reddit simulations.
- Improved logging to indicate when the number of rounds is truncated due to the `max_rounds` setting, enhancing traceability during simulation execution.
2025-12-05 15:50:54 +08:00

933 lines
34 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
OASIS模拟运行器
在后台运行模拟并记录每个Agent的动作支持实时状态监控
"""
import os
import sys
import json
import time
import asyncio
import threading
import subprocess
import signal
import atexit
from typing import Dict, Any, List, Optional
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from queue import Queue
from ..config import Config
from ..utils.logger import get_logger
# Module-level logger used by everything in this file
logger = get_logger('mirofish.simulation_runner')
# Set to True once SimulationRunner.register_cleanup() has installed its handlers
_cleanup_registered = False
class RunnerStatus(str, Enum):
    """Lifecycle status of a simulation runner.

    The enum mixes in ``str``, so each member compares equal to its raw
    string value and serializes naturally via ``.value``.
    """

    # Not started yet (default for a fresh run state)
    IDLE = "idle"
    # Run state created, subprocess not yet confirmed running
    STARTING = "starting"
    # Subprocess is alive
    RUNNING = "running"
    PAUSED = "paused"
    # A stop was requested and the process is being terminated
    STOPPING = "stopping"
    STOPPED = "stopped"
    # Subprocess exited with code 0
    COMPLETED = "completed"
    # Subprocess exited with a non-zero code or monitoring failed
    FAILED = "failed"
@dataclass
class AgentAction:
    """Record of a single action performed by an agent."""

    round_num: int
    timestamp: str
    platform: str  # twitter / reddit
    agent_id: int
    agent_name: str
    action_type: str  # CREATE_POST, LIKE_POST, etc.
    action_args: Dict[str, Any] = field(default_factory=dict)
    result: Optional[str] = None
    success: bool = True

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dict keyed by field name.

        Values are returned as-is (``action_args`` is not copied).
        """
        exported = (
            "round_num",
            "timestamp",
            "platform",
            "agent_id",
            "agent_name",
            "action_type",
            "action_args",
            "result",
            "success",
        )
        return {name: getattr(self, name) for name in exported}
@dataclass
class RoundSummary:
    """Summary of a single simulation round."""

    round_num: int
    start_time: str
    end_time: Optional[str] = None
    simulated_hour: int = 0
    twitter_actions: int = 0
    reddit_actions: int = 0
    active_agents: List[int] = field(default_factory=list)
    actions: List[AgentAction] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the summary as a dict, with every action serialized too."""
        plain_fields = (
            "round_num",
            "start_time",
            "end_time",
            "simulated_hour",
            "twitter_actions",
            "reddit_actions",
            "active_agents",
        )
        summary: Dict[str, Any] = {name: getattr(self, name) for name in plain_fields}
        summary["actions_count"] = len(self.actions)
        summary["actions"] = [entry.to_dict() for entry in self.actions]
        return summary
@dataclass
class SimulationRunState:
    """Live run state of one simulation."""

    simulation_id: str
    runner_status: RunnerStatus = RunnerStatus.IDLE

    # Progress information
    current_round: int = 0
    total_rounds: int = 0
    simulated_hours: int = 0
    total_simulation_hours: int = 0

    # Per-platform status
    twitter_running: bool = False
    reddit_running: bool = False
    twitter_actions_count: int = 0
    reddit_actions_count: int = 0

    # Per-round summaries
    rounds: List[RoundSummary] = field(default_factory=list)

    # Most recent actions, newest first (for live display in the frontend)
    recent_actions: List[AgentAction] = field(default_factory=list)
    max_recent_actions: int = 50

    # Timestamps (ISO-8601 strings)
    started_at: Optional[str] = None
    updated_at: str = field(default_factory=lambda: datetime.now().isoformat())
    completed_at: Optional[str] = None

    # Error message, if any
    error: Optional[str] = None

    # PID of the simulation process (used for stopping it)
    process_pid: Optional[int] = None

    def add_action(self, action: AgentAction):
        """Push an action onto the front of the recent-actions list.

        The list is trimmed to ``max_recent_actions`` entries, the matching
        per-platform counter is incremented (anything that is not
        ``"twitter"`` is counted as reddit) and ``updated_at`` is refreshed.
        """
        self.recent_actions.insert(0, action)
        overflow = len(self.recent_actions) - self.max_recent_actions
        if overflow > 0:
            self.recent_actions = self.recent_actions[:self.max_recent_actions]

        if action.platform != "twitter":
            self.reddit_actions_count += 1
        else:
            self.twitter_actions_count += 1

        self.updated_at = datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Return the summary view of the state (without the action list)."""
        # Guard against division by zero while total_rounds is still 0
        denominator = max(self.total_rounds, 1)
        progress = round(self.current_round / denominator * 100, 1)
        all_actions = self.twitter_actions_count + self.reddit_actions_count

        snapshot: Dict[str, Any] = {}
        snapshot["simulation_id"] = self.simulation_id
        snapshot["runner_status"] = self.runner_status.value
        snapshot["current_round"] = self.current_round
        snapshot["total_rounds"] = self.total_rounds
        snapshot["simulated_hours"] = self.simulated_hours
        snapshot["total_simulation_hours"] = self.total_simulation_hours
        snapshot["progress_percent"] = progress
        snapshot["twitter_running"] = self.twitter_running
        snapshot["reddit_running"] = self.reddit_running
        snapshot["twitter_actions_count"] = self.twitter_actions_count
        snapshot["reddit_actions_count"] = self.reddit_actions_count
        snapshot["total_actions_count"] = all_actions
        snapshot["started_at"] = self.started_at
        snapshot["updated_at"] = self.updated_at
        snapshot["completed_at"] = self.completed_at
        snapshot["error"] = self.error
        snapshot["process_pid"] = self.process_pid
        return snapshot

    def to_detail_dict(self) -> Dict[str, Any]:
        """Return the summary view plus the recent actions and round count."""
        detail = self.to_dict()
        detail.update(
            recent_actions=[entry.to_dict() for entry in self.recent_actions],
            rounds_count=len(self.rounds),
        )
        return detail
class SimulationRunner:
"""
Simulation runner.
Responsibilities:
1. Run the OASIS simulation in a background process
2. Parse the run logs and record each agent's actions
3. Provide an interface for querying live status
4. Support pause/stop/resume operations
NOTE(review): only start and stop are implemented by the methods in this
section of the file; no pause/resume method is visible here.
All state is kept in class-level dictionaries keyed by simulation ID, and
every method is a classmethod.
"""
# Directory where per-simulation run state is stored
RUN_STATE_DIR = os.path.join(
os.path.dirname(__file__),
'../../uploads/simulations'
)
# Directory containing the simulation scripts
SCRIPTS_DIR = os.path.join(
os.path.dirname(__file__),
'../../scripts'
)
# In-memory run state, keyed by simulation ID
_run_states: Dict[str, SimulationRunState] = {}
_processes: Dict[str, subprocess.Popen] = {}
_action_queues: Dict[str, Queue] = {}
_monitor_threads: Dict[str, threading.Thread] = {}
_stdout_files: Dict[str, Any] = {} # stdout file handles of running simulations
_stderr_files: Dict[str, Any] = {} # stderr file handles of running simulations
@classmethod
def get_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]:
    """Return the run state for a simulation, or None when there is none.

    The in-memory cache is consulted first; on a miss the state is loaded
    from disk and, if found, cached for later calls.
    """
    try:
        return cls._run_states[simulation_id]
    except KeyError:
        pass

    # Cache miss: fall back to the persisted state file
    loaded = cls._load_run_state(simulation_id)
    if not loaded:
        return loaded
    cls._run_states[simulation_id] = loaded
    return loaded
@classmethod
def _load_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]:
    """Load a run state from ``<RUN_STATE_DIR>/<simulation_id>/run_state.json``.

    Returns None when the file does not exist, or when reading/parsing it
    fails for any reason (the failure is logged).
    """
    state_file = os.path.join(cls.RUN_STATE_DIR, simulation_id, "run_state.json")
    if not os.path.exists(state_file):
        return None

    # Scalar fields that are copied verbatim, with their fallback values
    scalar_defaults = {
        "current_round": 0,
        "total_rounds": 0,
        "simulated_hours": 0,
        "total_simulation_hours": 0,
        "twitter_running": False,
        "reddit_running": False,
        "twitter_actions_count": 0,
        "reddit_actions_count": 0,
        "started_at": None,
        "completed_at": None,
        "error": None,
        "process_pid": None,
    }

    try:
        with open(state_file, 'r', encoding='utf-8') as handle:
            raw = json.load(handle)

        restored = SimulationRunState(
            simulation_id=simulation_id,
            runner_status=RunnerStatus(raw.get("runner_status", "idle")),
            updated_at=raw.get("updated_at", datetime.now().isoformat()),
            **{key: raw.get(key, fallback) for key, fallback in scalar_defaults.items()},
        )

        # Restore the recent actions in their stored order
        for item in raw.get("recent_actions", []):
            restored.recent_actions.append(
                AgentAction(
                    round_num=item.get("round_num", 0),
                    timestamp=item.get("timestamp", ""),
                    platform=item.get("platform", ""),
                    agent_id=item.get("agent_id", 0),
                    agent_name=item.get("agent_name", ""),
                    action_type=item.get("action_type", ""),
                    action_args=item.get("action_args", {}),
                    result=item.get("result"),
                    success=item.get("success", True),
                )
            )
        return restored
    except Exception as e:
        logger.error(f"加载运行状态失败: {str(e)}")
        return None
@classmethod
def _save_run_state(cls, state: SimulationRunState):
    """Persist a run state to ``run_state.json`` and refresh the in-memory cache.

    The simulation directory is created when missing. The detailed view of
    the state (including recent actions) is what gets written.
    """
    target_dir = os.path.join(cls.RUN_STATE_DIR, state.simulation_id)
    os.makedirs(target_dir, exist_ok=True)

    payload = state.to_detail_dict()
    target_file = os.path.join(target_dir, "run_state.json")
    with open(target_file, 'w', encoding='utf-8') as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)

    cls._run_states[state.simulation_id] = state
@classmethod
def start_simulation(
    cls,
    simulation_id: str,
    platform: str = "parallel",  # twitter / reddit / parallel
    max_rounds: Optional[int] = None  # optional cap on the number of rounds
) -> SimulationRunState:
    """
    Start a simulation in a background subprocess.

    Args:
        simulation_id: Simulation ID.
        platform: Platform to run ("twitter", "reddit"; anything else runs
            the parallel script covering both).
        max_rounds: Optional maximum number of rounds, used to truncate
            overly long simulations. Ignored unless it is a positive number.

    Returns:
        The SimulationRunState of the started simulation.

    Raises:
        ValueError: If the simulation is already running/starting, if its
            configuration file does not exist, or if the script is missing.
        Exception: Whatever is raised while launching the process; the state
            is marked FAILED before the exception is re-raised.
    """
    # Refuse to start a simulation that is already running
    existing = cls.get_run_state(simulation_id)
    if existing and existing.runner_status in [RunnerStatus.RUNNING, RunnerStatus.STARTING]:
        raise ValueError(f"模拟已在运行中: {simulation_id}")

    # Locate the simulation configuration
    sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id)
    config_path = os.path.join(sim_dir, "simulation_config.json")
    if not os.path.exists(config_path):
        raise ValueError("模拟配置不存在,请先调用 /prepare 接口")

    # Pick the script to run (scripts live in backend/scripts/).
    # This is validated BEFORE any state is persisted: otherwise a missing
    # script would leave a stale STARTING state behind and every later start
    # attempt would be rejected as "already running".
    if platform == "twitter":
        script_name = "run_twitter_simulation.py"
        run_twitter, run_reddit = True, False
    elif platform == "reddit":
        script_name = "run_reddit_simulation.py"
        run_twitter, run_reddit = False, True
    else:
        script_name = "run_parallel_simulation.py"
        run_twitter, run_reddit = True, True

    script_path = os.path.join(cls.SCRIPTS_DIR, script_name)
    if not os.path.exists(script_path):
        raise ValueError(f"脚本不存在: {script_path}")

    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)

    # Derive the number of rounds from the time configuration
    time_config = config.get("time_config", {})
    total_hours = time_config.get("total_simulation_hours", 72)
    minutes_per_round = time_config.get("minutes_per_round", 30)
    total_rounds = int(total_hours * 60 / minutes_per_round)

    # Truncate when a maximum number of rounds was requested
    has_round_cap = max_rounds is not None and max_rounds > 0
    if has_round_cap:
        original_rounds = total_rounds
        total_rounds = min(total_rounds, max_rounds)
        if total_rounds < original_rounds:
            logger.info(f"轮数已截断: {original_rounds} -> {total_rounds} (max_rounds={max_rounds})")

    state = SimulationRunState(
        simulation_id=simulation_id,
        runner_status=RunnerStatus.STARTING,
        total_rounds=total_rounds,
        total_simulation_hours=total_hours,
        twitter_running=run_twitter,
        reddit_running=run_reddit,
        started_at=datetime.now().isoformat(),
    )
    cls._save_run_state(state)

    # Create the action queue
    action_queue = Queue()
    cls._action_queues[simulation_id] = action_queue

    # Launch the simulation process
    main_log_file = None
    process = None
    try:
        # Build the command using full paths.
        # Log layout produced by the scripts:
        #   twitter/actions.jsonl - Twitter action log
        #   reddit/actions.jsonl  - Reddit action log
        #   simulation.log        - main process log
        cmd = [
            sys.executable,  # Python interpreter
            script_path,
            "--config", config_path,  # full path to the configuration file
        ]

        # Forward the round cap to the script when one was given
        if has_round_cap:
            cmd.extend(["--max-rounds", str(max_rounds)])

        # Send output to a log file instead of a pipe, so a full pipe buffer
        # can never block the child process
        main_log_path = os.path.join(sim_dir, "simulation.log")
        main_log_file = open(main_log_path, 'w', encoding='utf-8')

        # The working directory is the simulation directory (databases and
        # other files are generated there). start_new_session=True puts the
        # child in its own process group so os.killpg can terminate the whole
        # tree, including grandchildren.
        process = subprocess.Popen(
            cmd,
            cwd=sim_dir,
            stdout=main_log_file,
            stderr=subprocess.STDOUT,  # stderr goes into the same file
            text=True,
            bufsize=1,
            start_new_session=True,
        )

        # Keep the file handle so it can be closed once the process ends
        cls._stdout_files[simulation_id] = main_log_file
        cls._stderr_files[simulation_id] = None  # no separate stderr file

        state.process_pid = process.pid
        state.runner_status = RunnerStatus.RUNNING
        cls._processes[simulation_id] = process
        cls._save_run_state(state)

        # Start the monitor thread
        monitor_thread = threading.Thread(
            target=cls._monitor_simulation,
            args=(simulation_id,),
            daemon=True
        )
        monitor_thread.start()
        cls._monitor_threads[simulation_id] = monitor_thread

        logger.info(f"模拟启动成功: {simulation_id}, pid={process.pid}, platform={platform}")

    except Exception as e:
        if process is None:
            # The process never started, so nothing else will ever release
            # these resources: close the log file and drop the queue here.
            if main_log_file is not None:
                main_log_file.close()
            cls._action_queues.pop(simulation_id, None)
        state.runner_status = RunnerStatus.FAILED
        state.error = str(e)
        cls._save_run_state(state)
        raise

    return state
@classmethod
def _monitor_simulation(cls, simulation_id: str):
    """Watch a simulation process and feed its action logs into the run state.

    Runs in a background thread. While the process is alive, the
    per-platform action logs are polled every two seconds and the state is
    saved. Once the process exits, the logs are read one final time and the
    final status is recorded:

    * exit code 0 -> COMPLETED
    * non-zero exit code -> FAILED, with the tail of ``simulation.log``
    * a stop was requested (status STOPPING/STOPPED) -> status left alone,
      so a deliberately stopped simulation is not reported as FAILED just
      because the killed process exited with a non-zero code.

    Process bookkeeping and log file handles are always released at the end.
    """
    sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id)

    # Log layout: one action log per platform
    log_paths = {
        "twitter": os.path.join(sim_dir, "twitter", "actions.jsonl"),
        "reddit": os.path.join(sim_dir, "reddit", "actions.jsonl"),
    }
    # Read offset reached so far in each log
    positions = {"twitter": 0, "reddit": 0}

    process = cls._processes.get(simulation_id)
    state = cls.get_run_state(simulation_id)
    if not process or not state:
        return

    def _drain_logs():
        """Read whatever is new in each platform log that exists."""
        for platform_name, log_path in log_paths.items():
            if os.path.exists(log_path):
                positions[platform_name] = cls._read_action_log(
                    log_path, positions[platform_name], state, platform_name
                )

    try:
        while process.poll() is None:  # process still running
            _drain_logs()
            cls._save_run_state(state)
            time.sleep(2)

        # The process has exited: pick up anything written since the last poll
        _drain_logs()

        exit_code = process.returncode
        stop_requested = state.runner_status in (RunnerStatus.STOPPING, RunnerStatus.STOPPED)

        if stop_requested:
            # The code that requested the stop owns the final status; a
            # signal-terminated process must not be turned into FAILED here.
            pass
        elif exit_code == 0:
            state.runner_status = RunnerStatus.COMPLETED
            state.completed_at = datetime.now().isoformat()
            logger.info(f"模拟完成: {simulation_id}")
        else:
            state.runner_status = RunnerStatus.FAILED
            # Take the error details from the tail of the main log file
            main_log_path = os.path.join(sim_dir, "simulation.log")
            error_info = ""
            try:
                if os.path.exists(main_log_path):
                    with open(main_log_path, 'r', encoding='utf-8') as f:
                        error_info = f.read()[-2000:]  # last 2000 characters
            except Exception:
                # Best effort only: the exit code is still reported below
                pass
            state.error = f"进程退出码: {exit_code}, 错误: {error_info}"
            logger.error(f"模拟失败: {simulation_id}, error={state.error}")

        state.twitter_running = False
        state.reddit_running = False
        cls._save_run_state(state)

    except Exception as e:
        logger.error(f"监控线程异常: {simulation_id}, error={str(e)}")
        state.runner_status = RunnerStatus.FAILED
        state.error = str(e)
        cls._save_run_state(state)

    finally:
        # Release process bookkeeping
        cls._processes.pop(simulation_id, None)
        cls._action_queues.pop(simulation_id, None)

        # Drop this thread's registry entry so finished threads do not pile up
        if cls._monitor_threads.get(simulation_id) is threading.current_thread():
            cls._monitor_threads.pop(simulation_id, None)

        # Close the log file handles (best effort)
        for registry in (cls._stdout_files, cls._stderr_files):
            handle = registry.pop(simulation_id, None)
            if handle:
                try:
                    handle.close()
                except Exception:
                    pass
@classmethod
def _read_action_log(
    cls,
    log_path: str,
    position: int,
    state: SimulationRunState,
    platform: str
) -> int:
    """
    Read new entries from an action log (JSON Lines) into the run state.

    The log is being appended to by another process while it is read, so
    only fully consumed lines advance the returned offset:

    * a trailing line without a newline that does not parse yet is treated
      as still being written and is left for the next call;
    * the offset is advanced line by line, so an error part-way through
      never causes already-recorded actions to be read (and counted) again.

    Entries carrying an ``event_type`` key (simulation_start, round_start,
    ...) and entries that are not JSON objects are skipped.

    Args:
        log_path: Path of the log file.
        position: Byte offset returned by the previous call (0 initially).
        state: Run state that receives the actions.
        platform: Platform name (twitter/reddit) assigned to each action.

    Returns:
        The byte offset up to which the file has been consumed.
    """
    consumed = position
    try:
        # Binary mode gives exact byte offsets and lets us see whether a
        # line is newline-terminated.
        with open(log_path, 'rb') as f:
            f.seek(position)
            while True:
                raw = f.readline()
                if not raw:
                    break
                is_complete = raw.endswith(b"\n")

                try:
                    action_data = json.loads(raw.decode('utf-8'))
                except (UnicodeDecodeError, json.JSONDecodeError):
                    if not is_complete:
                        # Half-written last line: retry it on the next poll
                        break
                    # Blank or corrupt complete line: skip it for good
                    consumed = f.tell()
                    continue

                # From here on the line counts as consumed, whatever happens
                consumed = f.tell()

                if not isinstance(action_data, dict):
                    continue
                # Skip event entries (simulation_start, round_start, ...)
                if "event_type" in action_data:
                    continue

                action = AgentAction(
                    round_num=action_data.get("round", 0),
                    timestamp=action_data.get("timestamp", datetime.now().isoformat()),
                    platform=platform,
                    agent_id=action_data.get("agent_id", 0),
                    agent_name=action_data.get("agent_name", ""),
                    action_type=action_data.get("action_type", ""),
                    action_args=action_data.get("action_args", {}),
                    result=action_data.get("result"),
                    success=action_data.get("success", True),
                )
                state.add_action(action)

                # Track the highest round seen so far
                if action.round_num and action.round_num > state.current_round:
                    state.current_round = action.round_num
    except Exception as e:
        logger.warning(f"读取动作日志失败: {log_path}, error={e}")
    return consumed
@classmethod
def stop_simulation(cls, simulation_id: str) -> SimulationRunState:
    """Stop a running (or paused) simulation and return its updated state.

    The state is first marked STOPPING, then the whole process group is sent
    SIGTERM (followed by SIGKILL after 10 seconds without exit). If
    process-group termination fails, the process itself is terminated
    directly. Finally the state is marked STOPPED and saved.

    Raises:
        ValueError: If the simulation is unknown or is not RUNNING/PAUSED.
    """
    state = cls.get_run_state(simulation_id)
    if not state:
        raise ValueError(f"模拟不存在: {simulation_id}")

    stoppable = [RunnerStatus.RUNNING, RunnerStatus.PAUSED]
    if state.runner_status not in stoppable:
        raise ValueError(f"模拟未在运行: {simulation_id}, status={state.runner_status}")

    state.runner_status = RunnerStatus.STOPPING
    cls._save_run_state(state)

    # Terminate the process if it is still alive
    proc = cls._processes.get(simulation_id)
    if proc and proc.poll() is None:
        try:
            # The process was started with start_new_session=True, so its
            # process-group ID equals its PID; signalling the group also
            # reaches every child process.
            group_id = os.getpgid(proc.pid)
            logger.info(f"终止进程组: simulation={simulation_id}, pgid={group_id}")

            # Ask politely first
            os.killpg(group_id, signal.SIGTERM)
            try:
                proc.wait(timeout=10)
            except subprocess.TimeoutExpired:
                # Still alive after 10 seconds: force it
                logger.warning(f"进程组未响应 SIGTERM强制终止: {simulation_id}")
                os.killpg(group_id, signal.SIGKILL)
                proc.wait(timeout=5)
        except ProcessLookupError:
            # The process is already gone
            pass
        except Exception as e:
            logger.error(f"终止进程组失败: {simulation_id}, error={e}")
            # Fall back to terminating the process directly
            try:
                proc.terminate()
                proc.wait(timeout=5)
            except Exception:
                proc.kill()

    state.completed_at = datetime.now().isoformat()
    state.twitter_running = False
    state.reddit_running = False
    state.runner_status = RunnerStatus.STOPPED
    cls._save_run_state(state)

    logger.info(f"模拟已停止: {simulation_id}")
    return state
@classmethod
def get_actions(
    cls,
    simulation_id: str,
    limit: int = 100,
    offset: int = 0,
    platform: Optional[str] = None,
    agent_id: Optional[int] = None,
    round_num: Optional[int] = None
) -> List[AgentAction]:
    """
    Return the action history of a simulation, newest first.

    Actions are read from the per-platform logs ``twitter/actions.jsonl``
    and ``reddit/actions.jsonl`` (the layout the runner monitors). When
    neither exists, the legacy combined ``actions.jsonl`` is used instead.
    Entries carrying an ``event_type`` key, non-object entries and
    unparsable lines are skipped.

    Args:
        simulation_id: Simulation ID.
        limit: Maximum number of actions to return.
        offset: Number of (newest) actions to skip.
        platform: Only return actions of this platform.
        agent_id: Only return actions of this agent.
        round_num: Only return actions of this round.

    Returns:
        List of matching actions (empty when no log exists).
    """
    sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id)

    def _load(log_path: str, default_platform: str) -> List[AgentAction]:
        """Read one log file and return the entries passing the filters."""
        loaded: List[AgentAction] = []
        if not os.path.exists(log_path):
            return loaded
        with open(log_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # Skip event entries (simulation_start, round_start, ...)
                if not isinstance(data, dict) or "event_type" in data:
                    continue

                # Per-platform logs are identified by the file they live in;
                # an explicit "platform" field still takes precedence.
                entry_platform = data.get("platform") or default_platform

                if platform and entry_platform != platform:
                    continue
                if agent_id is not None and data.get("agent_id") != agent_id:
                    continue
                if round_num is not None and data.get("round") != round_num:
                    continue

                loaded.append(AgentAction(
                    round_num=data.get("round", 0),
                    timestamp=data.get("timestamp", ""),
                    platform=entry_platform,
                    agent_id=data.get("agent_id", 0),
                    agent_name=data.get("agent_name", ""),
                    action_type=data.get("action_type", ""),
                    action_args=data.get("action_args", {}),
                    result=data.get("result"),
                    success=data.get("success", True),
                ))
        return loaded

    platform_logs = [
        (name, os.path.join(sim_dir, name, "actions.jsonl"))
        for name in ("twitter", "reddit")
    ]
    available = [(name, path) for name, path in platform_logs if os.path.exists(path)]

    actions: List[AgentAction] = []
    if available:
        for name, path in available:
            actions.extend(_load(path, name))
        # Newest first. Reversing before the stable sort keeps later-written
        # entries ahead of earlier ones when timestamps are equal.
        actions.reverse()
        actions.sort(key=lambda entry: str(entry.timestamp), reverse=True)
    else:
        # Legacy layout: a single combined log, already in chronological order
        actions = _load(os.path.join(sim_dir, "actions.jsonl"), "")
        actions.reverse()

    # Pagination
    return actions[offset:offset + limit]
@classmethod
def get_timeline(
    cls,
    simulation_id: str,
    start_round: int = 0,
    end_round: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Return the simulation timeline, aggregated per round.

    Args:
        simulation_id: Simulation ID.
        start_round: First round to include.
        end_round: Last round to include (no upper bound when None).

    Returns:
        One summary dict per round, ordered by round number. Each contains
        the per-platform action counts, the active agents, a count per
        action type and the earliest/latest action timestamps of the round.
    """
    # NOTE: only the 10000 most recent actions are considered.
    actions = cls.get_actions(simulation_id, limit=10000)

    # Group by round
    rounds: Dict[int, Dict[str, Any]] = {}

    for action in actions:
        round_num = action.round_num

        if round_num < start_round:
            continue
        if end_round is not None and round_num > end_round:
            continue

        if round_num not in rounds:
            rounds[round_num] = {
                "round_num": round_num,
                "twitter_actions": 0,
                "reddit_actions": 0,
                "active_agents": set(),
                "action_types": {},
                "first_action_time": action.timestamp,
                "last_action_time": action.timestamp,
            }

        r = rounds[round_num]

        if action.platform == "twitter":
            r["twitter_actions"] += 1
        else:
            r["reddit_actions"] += 1

        r["active_agents"].add(action.agent_id)
        r["action_types"][action.action_type] = r["action_types"].get(action.action_type, 0) + 1

        # get_actions returns newest first, so the bounds are tracked by
        # comparison instead of by iteration order (ISO timestamps sort
        # chronologically as strings).
        if action.timestamp < r["first_action_time"]:
            r["first_action_time"] = action.timestamp
        if action.timestamp > r["last_action_time"]:
            r["last_action_time"] = action.timestamp

    # Convert to a list ordered by round number
    result = []
    for round_num in sorted(rounds.keys()):
        r = rounds[round_num]
        result.append({
            "round_num": round_num,
            "twitter_actions": r["twitter_actions"],
            "reddit_actions": r["reddit_actions"],
            "total_actions": r["twitter_actions"] + r["reddit_actions"],
            "active_agents_count": len(r["active_agents"]),
            "active_agents": list(r["active_agents"]),
            "action_types": r["action_types"],
            "first_action_time": r["first_action_time"],
            "last_action_time": r["last_action_time"],
        })

    return result
@classmethod
def get_agent_stats(cls, simulation_id: str) -> List[Dict[str, Any]]:
    """
    Return per-agent statistics for a simulation.

    Args:
        simulation_id: Simulation ID.

    Returns:
        One dict per agent with its total and per-platform action counts, a
        count per action type and the earliest/latest action timestamps,
        ordered by total number of actions (most active first).
    """
    # NOTE: only the 10000 most recent actions are considered.
    actions = cls.get_actions(simulation_id, limit=10000)

    agent_stats: Dict[int, Dict[str, Any]] = {}

    for action in actions:
        agent_id = action.agent_id

        if agent_id not in agent_stats:
            agent_stats[agent_id] = {
                "agent_id": agent_id,
                "agent_name": action.agent_name,
                "total_actions": 0,
                "twitter_actions": 0,
                "reddit_actions": 0,
                "action_types": {},
                "first_action_time": action.timestamp,
                "last_action_time": action.timestamp,
            }

        stats = agent_stats[agent_id]
        stats["total_actions"] += 1

        if action.platform == "twitter":
            stats["twitter_actions"] += 1
        else:
            stats["reddit_actions"] += 1

        stats["action_types"][action.action_type] = stats["action_types"].get(action.action_type, 0) + 1

        # get_actions returns newest first, so the bounds are tracked by
        # comparison instead of by iteration order (ISO timestamps sort
        # chronologically as strings).
        if action.timestamp < stats["first_action_time"]:
            stats["first_action_time"] = action.timestamp
        if action.timestamp > stats["last_action_time"]:
            stats["last_action_time"] = action.timestamp

    # Most active agents first
    result = sorted(agent_stats.values(), key=lambda x: x["total_actions"], reverse=True)

    return result
@classmethod
def cleanup_all_simulations(cls):
    """
    Terminate every simulation process that is still running.

    Called when the server shuts down so that no child process is left
    behind. For each live process the whole process group is sent SIGTERM
    (then SIGKILL after 5 seconds); if that fails for any reason the process
    itself is terminated directly, mirroring stop_simulation. The state of
    each terminated simulation is marked STOPPED. Afterwards all log file
    handles are closed and the in-memory bookkeeping is cleared.
    """
    logger.info("正在清理所有模拟进程...")

    # Iterate over a snapshot: monitor threads may modify the dict meanwhile
    processes = list(cls._processes.items())

    for simulation_id, process in processes:
        try:
            if process.poll() is None:  # process still running
                logger.info(f"终止模拟进程: {simulation_id}, pid={process.pid}")

                try:
                    # Terminate via the process group (includes all children)
                    pgid = os.getpgid(process.pid)
                    os.killpg(pgid, signal.SIGTERM)

                    try:
                        process.wait(timeout=5)
                    except subprocess.TimeoutExpired:
                        logger.warning(f"进程组未响应 SIGTERM强制终止: {simulation_id}")
                        os.killpg(pgid, signal.SIGKILL)
                        process.wait(timeout=5)

                except Exception:
                    # Group termination failed (process already gone, wait
                    # timed out again, process groups unavailable, ...):
                    # fall back to terminating the process directly.
                    try:
                        process.terminate()
                        process.wait(timeout=3)
                    except Exception:
                        process.kill()

                # Record the shutdown in the run state
                state = cls.get_run_state(simulation_id)
                if state:
                    state.runner_status = RunnerStatus.STOPPED
                    state.twitter_running = False
                    state.reddit_running = False
                    state.completed_at = datetime.now().isoformat()
                    state.error = "服务器关闭,模拟被终止"
                    cls._save_run_state(state)

        except Exception as e:
            logger.error(f"清理进程失败: {simulation_id}, error={e}")

    # Close the log file handles (best effort)
    for registry in (cls._stdout_files, cls._stderr_files):
        for file_handle in list(registry.values()):
            try:
                if file_handle:
                    file_handle.close()
            except Exception:
                pass
        registry.clear()

    # Clear the in-memory bookkeeping
    cls._processes.clear()
    cls._action_queues.clear()
    cls._monitor_threads.clear()

    logger.info("模拟进程清理完成")
@classmethod
def register_cleanup(cls):
    """
    Install the shutdown hooks that clean up simulation processes.

    Meant to be called when the Flask application starts. Registers
    cleanup_all_simulations with atexit and, when running in the main
    thread, wraps the SIGTERM and SIGINT handlers so the cleanup runs before
    the previously installed handler. Calling it more than once is a no-op.
    """
    global _cleanup_registered

    if _cleanup_registered:
        return

    # Remember the handlers that were installed before ours
    previous_handlers = {
        signal.SIGINT: signal.getsignal(signal.SIGINT),
        signal.SIGTERM: signal.getsignal(signal.SIGTERM),
    }

    def cleanup_handler(signum=None, frame=None):
        """Signal handler: clean up the simulations, then defer to the old handler."""
        logger.info(f"收到信号 {signum},开始清理...")
        cls.cleanup_all_simulations()

        # Hand over to the previous handler so Flask can exit normally
        prior = previous_handlers.get(signum)
        if callable(prior):
            prior(signum, frame)
            return
        # The previous handler is not callable (e.g. SIG_DFL), or the signal
        # is not one we wrapped: fall back to interrupting the process
        raise KeyboardInterrupt

    # atexit hook as a fallback
    atexit.register(cls.cleanup_all_simulations)

    # Signal handlers can only be installed from the main thread
    try:
        for signum in (signal.SIGTERM, signal.SIGINT):  # `kill` default, Ctrl+C
            signal.signal(signum, cleanup_handler)
    except ValueError:
        # Not in the main thread: atexit is all we have
        logger.warning("无法注册信号处理器(不在主线程),仅使用 atexit")

    _cleanup_registered = True
@classmethod
def get_running_simulations(cls) -> List[str]:
    """
    Return the IDs of all simulations whose process is still running.

    The process table is snapshotted before iterating because monitor
    threads remove entries from it concurrently; iterating the live dict
    could raise "dictionary changed size during iteration".
    """
    return [
        sim_id
        for sim_id, process in list(cls._processes.items())
        if process.poll() is None
    ]