Fix corrupted LLM-generated hour arrays in simulation config

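The LLM sometimes generates hour arrays like [19202122] instead of [19, 20, 21, 22]. Add _sanitize_hours() to validate these arrays: entries that are not integers in the range 0-23 are dropped, and the caller's default is used when the value is not a non-empty list, when it contains a single large number or long string, or when no valid hours remain. Also add flushed per-round progress output to the Reddit simulation loop.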
2026-03-13 20:32:33 +07:00 · 2026-03-13 20:32:33 +07:00 · 5e206bdd84
commit 5e206bdd84
parent 0ff30457a0
2 changed files with 23 additions and 9 deletions
--- a/backend/app/services/simulation_config_generator.py
+++ b/backend/app/services/simulation_config_generator.py
@ -24,6 +24,19 @@ from .zep_entity_reader import EntityNode, ZepEntityReader

 logger = get_logger('mirofish.simulation_config')

+
+def _sanitize_hours(val, default):
+    """Return the int entries of `val` that lie in 0-23, or `default` when `val` is unusable."""
+    if not isinstance(val, list) or not val:  # not a list (e.g. None from a missing key) or empty
+        return default
+    if len(val) == 1:  # a lone element may be several hours fused together, e.g. [19202122]
+        item = val[0]
+        if isinstance(item, str) and len(item) > 2:  # longer than any one- or two-digit hour
+            return default
+        if isinstance(item, (int, float)) and item > 23:  # number past the last hour of the day
+            return default
+    return [h for h in val if isinstance(h, int) and 0 <= h <= 23] or default  # strings/floats are dropped; NOTE(review): bools pass as ints
+
 # 中国作息时间配置（北京时间）
 CHINA_TIMEZONE_CONFIG = {
    # 深夜时段（几乎无人活动）
@ -631,12 +644,12 @@ Field descriptions:
            minutes_per_round=result.get("minutes_per_round", 60),  # 默认每轮1小时
            agents_per_hour_min=agents_per_hour_min,
            agents_per_hour_max=agents_per_hour_max,
-            peak_hours=result.get("peak_hours", [19, 20, 21, 22]),
-            off_peak_hours=result.get("off_peak_hours", [0, 1, 2, 3, 4, 5]),
+            peak_hours=_sanitize_hours(result.get("peak_hours"), [19, 20, 21, 22]),
+            off_peak_hours=_sanitize_hours(result.get("off_peak_hours"), [0, 1, 2, 3, 4, 5]),
            off_peak_activity_multiplier=0.05,  # 凌晨几乎无人
-            morning_hours=result.get("morning_hours", [6, 7, 8]),
+            morning_hours=_sanitize_hours(result.get("morning_hours"), [6, 7, 8]),
            morning_activity_multiplier=0.4,
-            work_hours=result.get("work_hours", list(range(9, 19))),
+            work_hours=_sanitize_hours(result.get("work_hours"), list(range(9, 19))),
            work_activity_multiplier=0.7,
            peak_activity_multiplier=1.5
        )
@ -890,7 +903,7 @@ Return JSON format (no markdown):
                activity_level=cfg.get("activity_level", 0.5),
                posts_per_hour=cfg.get("posts_per_hour", 0.5),
                comments_per_hour=cfg.get("comments_per_hour", 1.0),
-                active_hours=cfg.get("active_hours", list(range(9, 23))),
+                active_hours=_sanitize_hours(cfg.get("active_hours"), list(range(9, 23))),
                response_delay_min=cfg.get("response_delay_min", 5),
                response_delay_max=cfg.get("response_delay_max", 60),
                sentiment_bias=cfg.get("sentiment_bias", 0.0),
--- a/backend/scripts/run_reddit_simulation.py
+++ b/backend/scripts/run_reddit_simulation.py
@ -620,7 +620,7 @@ class RedditSimulationRunner:
                print(f"  已发布 {len(initial_actions)} 条初始帖子")
        
        # 主模拟循环
-        print("\n开始模拟循环...")
+        print("\n开始模拟循环...", flush=True)
        start_time = datetime.now()

        for round_num in range(total_rounds):
@ -632,6 +632,7 @@ class RedditSimulationRunner:
                self.env, simulated_hour, round_num
            )

+            print(f"  Round {round_num+1}/{total_rounds}: hour={simulated_hour}, active={len(active_agents)}", flush=True)
            if not active_agents:
                continue