Refactor simulation management and enhance logging capabilities

- Updated simulation preparation checks to exclude script files from the required files list, improving clarity on file management. - Implemented a robust retry mechanism for Zep API calls in the ZepEntityReader service, enhancing reliability. - Enhanced logging in simulation scripts to provide clearer insights into the simulation process and errors. - Updated simulation runner to manage stdout and stderr logs more effectively, ensuring better error tracking. - Improved profile generation to standardize gender fields and ensure all required fields are populated correctly.
2025-12-02 14:25:53 +08:00
parent af5c235695
commit 3cc5e3f479
8 changed files with 595 additions and 165 deletions
--- a/backend/app/services/simulation_runner.py
+++ b/backend/app/services/simulation_runner.py
@@ -182,11 +182,19 @@ class SimulationRunner:
        '../../uploads/simulations'
    )
    
+    # 脚本目录
+    SCRIPTS_DIR = os.path.join(
+        os.path.dirname(__file__),
+        '../../scripts'
+    )
+    
    # 内存中的运行状态
    _run_states: Dict[str, SimulationRunState] = {}
    _processes: Dict[str, subprocess.Popen] = {}
    _action_queues: Dict[str, Queue] = {}
    _monitor_threads: Dict[str, threading.Thread] = {}
+    _stdout_files: Dict[str, Any] = {}  # 存储 stdout 文件句柄
+    _stderr_files: Dict[str, Any] = {}  # 存储 stderr 文件句柄
    
    @classmethod
    def get_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]:
@@ -310,7 +318,7 @@ class SimulationRunner:
        
        cls._save_run_state(state)
        
-        # 确定运行哪个脚本
+        # 确定运行哪个脚本（脚本位于 backend/scripts/ 目录）
        if platform == "twitter":
            script_name = "run_twitter_simulation.py"
            state.twitter_running = True
@@ -322,7 +330,7 @@ class SimulationRunner:
            state.twitter_running = True
            state.reddit_running = True
        
-        script_path = os.path.join(sim_dir, script_name)
+        script_path = os.path.join(cls.SCRIPTS_DIR, script_name)
        
        if not os.path.exists(script_path):
            raise ValueError(f"脚本不存在: {script_path}")
@@ -333,24 +341,36 @@ class SimulationRunner:
        
        # 启动模拟进程
        try:
-            # 构建运行命令
+            # 构建运行命令，使用完整路径
+            action_log_path = os.path.join(sim_dir, "actions.jsonl")
+            
            cmd = [
                sys.executable,  # Python解释器
                script_path,
-                "--config", "simulation_config.json",
-                "--action-log", "actions.jsonl",  # 动作日志文件
+                "--config", config_path,  # 使用完整配置文件路径
+                "--action-log", action_log_path,  # 动作日志文件完整路径
            ]
            
-            # 设置工作目录为模拟目录
+            # 创建输出日志文件，避免 stdout/stderr 管道缓冲区满导致进程阻塞
+            stdout_log_path = os.path.join(sim_dir, "simulation_stdout.log")
+            stderr_log_path = os.path.join(sim_dir, "simulation_stderr.log")
+            stdout_file = open(stdout_log_path, 'w', encoding='utf-8')
+            stderr_file = open(stderr_log_path, 'w', encoding='utf-8')
+            
+            # 设置工作目录为模拟目录（数据库等文件会生成在此）
            process = subprocess.Popen(
                cmd,
                cwd=sim_dir,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
+                stdout=stdout_file,
+                stderr=stderr_file,
                text=True,
                bufsize=1,
            )
            
+            # 保存文件句柄以便后续关闭
+            cls._stdout_files[simulation_id] = stdout_file
+            cls._stderr_files[simulation_id] = stderr_file
+            
            state.process_pid = process.pid
            state.runner_status = RunnerStatus.RUNNING
            cls._processes[simulation_id] = process
@@ -434,8 +454,16 @@ class SimulationRunner:
                logger.info(f"模拟完成: {simulation_id}")
            else:
                state.runner_status = RunnerStatus.FAILED
-                stderr = process.stderr.read() if process.stderr else ""
-                state.error = f"进程退出码: {exit_code}, 错误: {stderr[:500]}"
+                # 从 stderr 日志文件读取错误信息
+                stderr_log_path = os.path.join(sim_dir, "simulation_stderr.log")
+                stderr = ""
+                try:
+                    if os.path.exists(stderr_log_path):
+                        with open(stderr_log_path, 'r', encoding='utf-8') as f:
+                            stderr = f.read()
+                except Exception:
+                    pass
+                state.error = f"进程退出码: {exit_code}, 错误: {stderr[-1000:]}"  # 取最后1000字符
                logger.error(f"模拟失败: {simulation_id}, error={state.error}")
            
            state.twitter_running = False
@@ -449,9 +477,23 @@ class SimulationRunner:
            cls._save_run_state(state)
        
        finally:
-            # 清理
+            # 清理进程资源
            cls._processes.pop(simulation_id, None)
            cls._action_queues.pop(simulation_id, None)
+            
+            # 关闭日志文件句柄
+            if simulation_id in cls._stdout_files:
+                try:
+                    cls._stdout_files[simulation_id].close()
+                except Exception:
+                    pass
+                cls._stdout_files.pop(simulation_id, None)
+            if simulation_id in cls._stderr_files:
+                try:
+                    cls._stderr_files[simulation_id].close()
+                except Exception:
+                    pass
+                cls._stderr_files.pop(simulation_id, None)
    
    @classmethod
    def stop_simulation(cls, simulation_id: str) -> SimulationRunState: