Enhance signal handling and suppress warnings in simulation scripts

- Added signal handling to gracefully manage shutdown events across simulation scripts, ensuring proper cleanup of resources. - Introduced a global shutdown event to allow simulations to respond to termination signals, improving robustness. - Suppressed warnings related to multiprocessing resource tracking to avoid unnecessary log clutter during execution. - Updated cleanup logic in `SimulationRunner` and `ZepGraphMemoryManager` to prevent redundant calls and ensure efficient resource management. - Enhanced logging to provide clearer feedback during shutdown processes, improving traceability.
2025-12-09 00:37:12 +08:00
parent 3f750ffda2
commit 91eb73ae44
7 changed files with 262 additions and 23 deletions
--- a/backend/app/services/simulation_runner.py
+++ b/backend/app/services/simulation_runner.py
@@ -854,6 +854,9 @@ class SimulationRunner:
        
        return result
    
+    # 防止重复清理的标志
+    _cleanup_done = False
+    
    @classmethod
    def cleanup_all_simulations(cls):
        """
@@ -861,12 +864,23 @@ class SimulationRunner:
        
        在服务器关闭时调用，确保所有子进程被终止
        """
+        # 防止重复清理
+        if cls._cleanup_done:
+            return
+        cls._cleanup_done = True
+        
+        # 检查是否有内容需要清理（避免空进程的进程打印无用日志）
+        has_processes = bool(cls._processes)
+        has_updaters = bool(cls._graph_memory_enabled)
+        
+        if not has_processes and not has_updaters:
+            return  # 没有需要清理的内容，静默返回
+        
        logger.info("正在清理所有模拟进程...")
        
-        # 首先停止所有图谱记忆更新器
+        # 首先停止所有图谱记忆更新器（stop_all 内部会打印日志）
        try:
            ZepGraphMemoryManager.stop_all()
-            logger.info("已停止所有图谱记忆更新器")
        except Exception as e:
            logger.error(f"停止图谱记忆更新器失败: {e}")
        cls._graph_memory_enabled.clear()
@@ -899,7 +913,7 @@ class SimulationRunner:
                        except Exception:
                            process.kill()
                    
-                    # 更新状态
+                    # 更新 run_state.json
                    state = cls.get_run_state(simulation_id)
                    if state:
                        state.runner_status = RunnerStatus.STOPPED
@@ -908,6 +922,24 @@ class SimulationRunner:
                        state.completed_at = datetime.now().isoformat()
                        state.error = "服务器关闭，模拟被终止"
                        cls._save_run_state(state)
+                    
+                    # 同时更新 state.json，将状态设为 stopped
+                    try:
+                        sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id)
+                        state_file = os.path.join(sim_dir, "state.json")
+                        logger.info(f"尝试更新 state.json: {state_file}")
+                        if os.path.exists(state_file):
+                            with open(state_file, 'r', encoding='utf-8') as f:
+                                state_data = json.load(f)
+                            state_data['status'] = 'stopped'
+                            state_data['updated_at'] = datetime.now().isoformat()
+                            with open(state_file, 'w', encoding='utf-8') as f:
+                                json.dump(state_data, f, indent=2, ensure_ascii=False)
+                            logger.info(f"已更新 state.json 状态为 stopped: {simulation_id}")
+                        else:
+                            logger.warning(f"state.json 不存在: {state_file}")
+                    except Exception as state_err:
+                        logger.warning(f"更新 state.json 失败: {simulation_id}, error={state_err}")
                        
            except Exception as e:
                logger.error(f"清理进程失败: {simulation_id}, error={e}")
@@ -947,13 +979,26 @@ class SimulationRunner:
        if _cleanup_registered:
            return
        
+        # Flask debug 模式下，只在 reloader 子进程中注册清理（实际运行应用的进程）
+        # WERKZEUG_RUN_MAIN=true 表示是 reloader 子进程
+        # 如果不是 debug 模式，则没有这个环境变量，也需要注册
+        is_reloader_process = os.environ.get('WERKZEUG_RUN_MAIN') == 'true'
+        is_debug_mode = os.environ.get('FLASK_DEBUG') == '1' or os.environ.get('WERKZEUG_RUN_MAIN') is not None
+        
+        # 在 debug 模式下，只在 reloader 子进程中注册；非 debug 模式下始终注册
+        if is_debug_mode and not is_reloader_process:
+            _cleanup_registered = True  # 标记已注册，防止子进程再次尝试
+            return
+        
        # 保存原有的信号处理器
        original_sigint = signal.getsignal(signal.SIGINT)
        original_sigterm = signal.getsignal(signal.SIGTERM)
        
        def cleanup_handler(signum=None, frame=None):
            """信号处理器：先清理模拟进程，再调用原处理器"""
-            logger.info(f"收到信号 {signum}，开始清理...")
+            # 只有在有进程需要清理时才打印日志
+            if cls._processes or cls._graph_memory_enabled:
+                logger.info(f"收到信号 {signum}，开始清理...")
            cls.cleanup_all_simulations()
            
            # 调用原有的信号处理器，让 Flask 正常退出
--- a/backend/app/services/zep_graph_memory_updater.py
+++ b/backend/app/services/zep_graph_memory_updater.py
@@ -506,16 +506,25 @@ class ZepGraphMemoryManager:
                del cls._updaters[simulation_id]
                logger.info(f"已停止图谱记忆更新器: simulation_id={simulation_id}")
    
+    # 防止 stop_all 重复调用的标志
+    _stop_all_done = False
+    
    @classmethod
    def stop_all(cls):
        """停止所有更新器"""
+        # 防止重复调用
+        if cls._stop_all_done:
+            return
+        cls._stop_all_done = True
+        
        with cls._lock:
-            for simulation_id, updater in list(cls._updaters.items()):
-                try:
-                    updater.stop()
-                except Exception as e:
-                    logger.error(f"停止更新器失败: simulation_id={simulation_id}, error={e}")
-            cls._updaters.clear()
+            if cls._updaters:
+                for simulation_id, updater in list(cls._updaters.items()):
+                    try:
+                        updater.stop()
+                    except Exception as e:
+                        logger.error(f"停止更新器失败: simulation_id={simulation_id}, error={e}")
+                cls._updaters.clear()
            logger.info("已停止所有图谱记忆更新器")
    
    @classmethod