Enhance Report Agent and Zep Tools with new search functionalities

- Introduced new core search tools in the Report Agent: InsightForge for deep insights, PanoramaSearch for comprehensive views, and QuickSearch for rapid queries.
- Updated the Report Agent to prioritize tool usage for data retrieval, ensuring all report content is based on simulation results rather than internal knowledge.
- Enhanced the ZepToolsService with methods for InsightForge and PanoramaSearch, allowing for multi-dimensional queries and historical data retrieval.
- Improved documentation to reflect the new functionalities and usage guidelines for the Report Agent and Zep tools.
This commit is contained in:
666ghj
2025-12-09 16:06:53 +08:00
parent 5ece3f670b
commit a0c97b1344
2 changed files with 889 additions and 119 deletions

View File

@@ -1,16 +1,23 @@
"""
Zep检索工具服务
封装图谱搜索、节点读取、边查询等工具供Report Agent使用
核心检索工具(优化后):
1. InsightForge深度洞察检索- 最强大的混合检索,自动生成子问题并多维度检索
2. PanoramaSearch广度搜索- 获取全貌,包括过期内容
3. QuickSearch简单搜索- 快速检索
"""
import time
import json
from typing import Dict, Any, List, Optional
from dataclasses import dataclass
from dataclasses import dataclass, field
from zep_cloud.client import Zep
from ..config import Config
from ..utils.logger import get_logger
from ..utils.llm_client import LLMClient
logger = get_logger('mirofish.zep_tools')
@@ -79,6 +86,11 @@ class EdgeInfo:
target_node_uuid: str
source_node_name: Optional[str] = None
target_node_name: Optional[str] = None
# 时间信息
created_at: Optional[str] = None
valid_at: Optional[str] = None
invalid_at: Optional[str] = None
expired_at: Optional[str] = None
def to_dict(self) -> Dict[str, Any]:
return {
@@ -88,42 +100,231 @@ class EdgeInfo:
"source_node_uuid": self.source_node_uuid,
"target_node_uuid": self.target_node_uuid,
"source_node_name": self.source_node_name,
"target_node_name": self.target_node_name
"target_node_name": self.target_node_name,
"created_at": self.created_at,
"valid_at": self.valid_at,
"invalid_at": self.invalid_at,
"expired_at": self.expired_at
}
def to_text(self, include_temporal: bool = False) -> str:
"""转换为文本格式"""
source = self.source_node_name or self.source_node_uuid[:8]
target = self.target_node_name or self.target_node_uuid[:8]
base_text = f"关系: {source} --[{self.name}]--> {target}\n事实: {self.fact}"
if include_temporal:
valid_at = self.valid_at or "未知"
invalid_at = self.invalid_at or "至今"
base_text += f"\n时效: {valid_at} - {invalid_at}"
if self.expired_at:
base_text += f" (已过期: {self.expired_at})"
return base_text
@property
def is_expired(self) -> bool:
"""是否已过期"""
return self.expired_at is not None
@property
def is_invalid(self) -> bool:
"""是否已失效"""
return self.invalid_at is not None
@dataclass
class InsightForgeResult:
"""
深度洞察检索结果 (InsightForge)
包含多个子问题的检索结果,以及综合分析
"""
query: str
simulation_requirement: str
sub_queries: List[str]
# 各维度检索结果
semantic_facts: List[str] = field(default_factory=list) # 语义搜索结果
entity_insights: List[Dict[str, Any]] = field(default_factory=list) # 实体洞察
relationship_chains: List[str] = field(default_factory=list) # 关系链
# 统计信息
total_facts: int = 0
total_entities: int = 0
total_relationships: int = 0
def to_dict(self) -> Dict[str, Any]:
return {
"query": self.query,
"simulation_requirement": self.simulation_requirement,
"sub_queries": self.sub_queries,
"semantic_facts": self.semantic_facts,
"entity_insights": self.entity_insights,
"relationship_chains": self.relationship_chains,
"total_facts": self.total_facts,
"total_entities": self.total_entities,
"total_relationships": self.total_relationships
}
def to_text(self) -> str:
"""转换为详细的文本格式供LLM理解"""
text_parts = [
f"## 深度洞察检索结果",
f"原始问题: {self.query}",
f"模拟需求: {self.simulation_requirement}",
f"\n### 检索统计",
f"- 相关事实: {self.total_facts}",
f"- 涉及实体: {self.total_entities}",
f"- 关系链: {self.total_relationships}"
]
# 子问题
if self.sub_queries:
text_parts.append(f"\n### 分析的子问题")
for i, sq in enumerate(self.sub_queries, 1):
text_parts.append(f"{i}. {sq}")
# 语义搜索结果
if self.semantic_facts:
text_parts.append(f"\n### 【关键事实】(请在报告中引用这些原文)")
for i, fact in enumerate(self.semantic_facts, 1):
text_parts.append(f"{i}. \"{fact}\"")
# 实体洞察
if self.entity_insights:
text_parts.append(f"\n### 【核心实体】")
for entity in self.entity_insights:
text_parts.append(f"- **{entity.get('name', '未知')}** ({entity.get('type', '实体')})")
if entity.get('summary'):
text_parts.append(f" 摘要: \"{entity.get('summary')}\"")
if entity.get('related_facts'):
text_parts.append(f" 相关事实: {len(entity.get('related_facts', []))}")
# 关系链
if self.relationship_chains:
text_parts.append(f"\n### 【关系链】")
for chain in self.relationship_chains:
text_parts.append(f"- {chain}")
return "\n".join(text_parts)
@dataclass
class PanoramaResult:
"""
广度搜索结果 (Panorama)
包含所有相关信息,包括过期内容
"""
query: str
# 全部节点
all_nodes: List[NodeInfo] = field(default_factory=list)
# 全部边(包括过期的)
all_edges: List[EdgeInfo] = field(default_factory=list)
# 当前有效的事实
active_facts: List[str] = field(default_factory=list)
# 已过期/失效的事实(历史记录)
historical_facts: List[str] = field(default_factory=list)
# 统计
total_nodes: int = 0
total_edges: int = 0
active_count: int = 0
historical_count: int = 0
def to_dict(self) -> Dict[str, Any]:
return {
"query": self.query,
"all_nodes": [n.to_dict() for n in self.all_nodes],
"all_edges": [e.to_dict() for e in self.all_edges],
"active_facts": self.active_facts,
"historical_facts": self.historical_facts,
"total_nodes": self.total_nodes,
"total_edges": self.total_edges,
"active_count": self.active_count,
"historical_count": self.historical_count
}
def to_text(self) -> str:
"""转换为文本格式"""
source = self.source_node_name or self.source_node_uuid[:8]
target = self.target_node_name or self.target_node_uuid[:8]
return f"关系: {source} --[{self.name}]--> {target}\n事实: {self.fact}"
text_parts = [
f"## 广度搜索结果(全貌视图)",
f"查询: {self.query}",
f"\n### 统计信息",
f"- 总节点数: {self.total_nodes}",
f"- 总边数: {self.total_edges}",
f"- 当前有效事实: {self.active_count}",
f"- 历史/过期事实: {self.historical_count}"
]
# 当前有效的事实
if self.active_facts:
text_parts.append(f"\n### 【当前有效事实】(模拟结果原文)")
for i, fact in enumerate(self.active_facts[:30], 1):
text_parts.append(f"{i}. \"{fact}\"")
if len(self.active_facts) > 30:
text_parts.append(f"... 还有 {len(self.active_facts) - 30}")
# 历史/过期事实
if self.historical_facts:
text_parts.append(f"\n### 【历史/过期事实】(演变过程记录)")
for i, fact in enumerate(self.historical_facts[:20], 1):
text_parts.append(f"{i}. \"{fact}\"")
if len(self.historical_facts) > 20:
text_parts.append(f"... 还有 {len(self.historical_facts) - 20}")
# 关键实体
if self.all_nodes:
text_parts.append(f"\n### 【涉及实体】")
for node in self.all_nodes[:20]:
entity_type = next((l for l in node.labels if l not in ["Entity", "Node"]), "实体")
text_parts.append(f"- **{node.name}** ({entity_type})")
if len(self.all_nodes) > 20:
text_parts.append(f"... 还有 {len(self.all_nodes) - 20} 个实体")
return "\n".join(text_parts)
class ZepToolsService:
"""
Zep检索工具服务
提供多种图谱检索工具可以被Report Agent调用
1. search_graph - 图谱语义搜索
2. get_all_nodes - 获取图谱所有节点
3. get_all_edges - 获取图谱所有边
4. get_node_detail - 获取节点详细信息
5. get_node_edges - 获取节点相关的边
6. get_entities_by_type - 按类型获取实体
7. get_entity_summary - 获取实体的关系摘要
【核心检索工具 - 优化后】
1. insight_forge - 深度洞察检索(最强大,自动生成子问题,多维度检索)
2. panorama_search - 广度搜索(获取全貌,包括过期内容)
3. quick_search - 简单搜索(快速检索)
【基础工具】
- search_graph - 图谱语义搜索
- get_all_nodes - 获取图谱所有节点
- get_all_edges - 获取图谱所有边(含时间信息)
- get_node_detail - 获取节点详细信息
- get_node_edges - 获取节点相关的边
- get_entities_by_type - 按类型获取实体
- get_entity_summary - 获取实体的关系摘要
"""
# 重试配置
MAX_RETRIES = 3
RETRY_DELAY = 2.0
def __init__(self, api_key: Optional[str] = None):
def __init__(self, api_key: Optional[str] = None, llm_client: Optional[LLMClient] = None):
self.api_key = api_key or Config.ZEP_API_KEY
if not self.api_key:
raise ValueError("ZEP_API_KEY 未配置")
self.client = Zep(api_key=self.api_key)
# LLM客户端用于InsightForge生成子问题
self._llm_client = llm_client
logger.info("ZepToolsService 初始化完成")
@property
def llm(self) -> LLMClient:
"""延迟初始化LLM客户端"""
if self._llm_client is None:
self._llm_client = LLMClient()
return self._llm_client
def _call_with_retry(self, func, operation_name: str, max_retries: int = None):
"""带重试机制的API调用"""
max_retries = max_retries or self.MAX_RETRIES
@@ -363,15 +564,16 @@ class ZepToolsService:
logger.info(f"获取到 {len(result)} 个节点")
return result
def get_all_edges(self, graph_id: str) -> List[EdgeInfo]:
def get_all_edges(self, graph_id: str, include_temporal: bool = True) -> List[EdgeInfo]:
"""
获取图谱的所有边
获取图谱的所有边(包含时间信息)
Args:
graph_id: 图谱ID
include_temporal: 是否包含时间信息默认True
Returns:
边列表
边列表包含created_at, valid_at, invalid_at, expired_at
"""
logger.info(f"获取图谱 {graph_id} 的所有边...")
@@ -382,13 +584,22 @@ class ZepToolsService:
result = []
for edge in edges:
result.append(EdgeInfo(
edge_info = EdgeInfo(
uuid=getattr(edge, 'uuid_', None) or getattr(edge, 'uuid', ''),
name=edge.name or "",
fact=edge.fact or "",
source_node_uuid=edge.source_node_uuid or "",
target_node_uuid=edge.target_node_uuid or ""
))
)
# 添加时间信息
if include_temporal:
edge_info.created_at = getattr(edge, 'created_at', None)
edge_info.valid_at = getattr(edge, 'valid_at', None)
edge_info.invalid_at = getattr(edge, 'invalid_at', None)
edge_info.expired_at = getattr(edge, 'expired_at', None)
result.append(edge_info)
logger.info(f"获取到 {len(result)} 条边")
return result
@@ -619,3 +830,322 @@ class ZepToolsService:
"entities": entities[:limit], # 限制数量
"total_entities": len(entities)
}
# ========== 核心检索工具(优化后) ==========
def insight_forge(
self,
graph_id: str,
query: str,
simulation_requirement: str,
report_context: str = "",
max_sub_queries: int = 5
) -> InsightForgeResult:
"""
【InsightForge - 深度洞察检索】
最强大的混合检索函数,自动分解问题并多维度检索:
1. 使用LLM将问题分解为多个子问题
2. 对每个子问题进行语义搜索
3. 提取相关实体并获取其详细信息
4. 追踪关系链
5. 整合所有结果,生成深度洞察
Args:
graph_id: 图谱ID
query: 用户问题
simulation_requirement: 模拟需求描述
report_context: 报告上下文(可选,用于更精准的子问题生成)
max_sub_queries: 最大子问题数量
Returns:
InsightForgeResult: 深度洞察检索结果
"""
logger.info(f"InsightForge 深度洞察检索: {query[:50]}...")
result = InsightForgeResult(
query=query,
simulation_requirement=simulation_requirement,
sub_queries=[]
)
# Step 1: 使用LLM生成子问题
sub_queries = self._generate_sub_queries(
query=query,
simulation_requirement=simulation_requirement,
report_context=report_context,
max_queries=max_sub_queries
)
result.sub_queries = sub_queries
logger.info(f"生成 {len(sub_queries)} 个子问题")
# Step 2: 对每个子问题进行语义搜索
all_facts = []
all_edges = []
seen_facts = set()
for sub_query in sub_queries:
search_result = self.search_graph(
graph_id=graph_id,
query=sub_query,
limit=15,
scope="edges"
)
for fact in search_result.facts:
if fact not in seen_facts:
all_facts.append(fact)
seen_facts.add(fact)
all_edges.extend(search_result.edges)
# 对原始问题也进行搜索
main_search = self.search_graph(
graph_id=graph_id,
query=query,
limit=20,
scope="edges"
)
for fact in main_search.facts:
if fact not in seen_facts:
all_facts.append(fact)
seen_facts.add(fact)
result.semantic_facts = all_facts
result.total_facts = len(all_facts)
# Step 3: 提取相关实体并获取详细信息
all_nodes = self.get_all_nodes(graph_id)
node_map = {n.uuid: n for n in all_nodes}
# 从边中提取涉及的实体
entity_uuids = set()
for edge_data in all_edges:
if isinstance(edge_data, dict):
entity_uuids.add(edge_data.get('source_node_uuid', ''))
entity_uuids.add(edge_data.get('target_node_uuid', ''))
# 获取实体详情
entity_insights = []
for uuid in list(entity_uuids)[:30]: # 限制数量
if uuid in node_map:
node = node_map[uuid]
entity_type = next((l for l in node.labels if l not in ["Entity", "Node"]), "实体")
# 获取该实体相关的事实
related_facts = [
f for f in all_facts
if node.name.lower() in f.lower()
]
entity_insights.append({
"uuid": node.uuid,
"name": node.name,
"type": entity_type,
"summary": node.summary,
"related_facts": related_facts[:5]
})
result.entity_insights = entity_insights
result.total_entities = len(entity_insights)
# Step 4: 构建关系链
relationship_chains = []
for edge_data in all_edges[:20]:
if isinstance(edge_data, dict):
source_uuid = edge_data.get('source_node_uuid', '')
target_uuid = edge_data.get('target_node_uuid', '')
relation_name = edge_data.get('name', '')
source_name = node_map.get(source_uuid, NodeInfo('', '', [], '', {})).name or source_uuid[:8]
target_name = node_map.get(target_uuid, NodeInfo('', '', [], '', {})).name or target_uuid[:8]
chain = f"{source_name} --[{relation_name}]--> {target_name}"
if chain not in relationship_chains:
relationship_chains.append(chain)
result.relationship_chains = relationship_chains
result.total_relationships = len(relationship_chains)
logger.info(f"InsightForge完成: {result.total_facts}条事实, {result.total_entities}个实体, {result.total_relationships}条关系")
return result
def _generate_sub_queries(
self,
query: str,
simulation_requirement: str,
report_context: str = "",
max_queries: int = 5
) -> List[str]:
"""
使用LLM生成子问题
将复杂问题分解为多个可以独立检索的子问题
"""
system_prompt = """你是一个专业的问题分析专家。你的任务是将一个复杂问题分解为多个可以独立检索的子问题。
要求:
1. 每个子问题应该足够具体,可以在知识图谱中检索到相关信息
2. 子问题应该覆盖原问题的不同维度(如:谁、什么、为什么、怎么样、何时、何地)
3. 子问题应该与模拟场景相关
4. 返回JSON格式{"sub_queries": ["子问题1", "子问题2", ...]}"""
user_prompt = f"""模拟需求背景:
{simulation_requirement}
{f"报告上下文:{report_context[:500]}" if report_context else ""}
请将以下问题分解为{max_queries}个子问题:
{query}
返回JSON格式的子问题列表。"""
try:
response = self.llm.chat_json(
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}
],
temperature=0.3
)
sub_queries = response.get("sub_queries", [])
# 确保是字符串列表
return [str(sq) for sq in sub_queries[:max_queries]]
except Exception as e:
logger.warning(f"生成子问题失败: {str(e)},使用默认子问题")
# 降级:返回基于原问题的变体
return [
query,
f"{query} 的主要参与者",
f"{query} 的原因和影响",
f"{query} 的发展过程"
][:max_queries]
def panorama_search(
self,
graph_id: str,
query: str,
include_expired: bool = True,
limit: int = 50
) -> PanoramaResult:
"""
【PanoramaSearch - 广度搜索】
获取全貌视图,包括所有相关内容和历史/过期信息:
1. 获取所有相关节点
2. 获取所有边(包括已过期/失效的)
3. 分类整理当前有效和历史信息
这个工具适用于需要了解事件全貌、追踪演变过程的场景。
Args:
graph_id: 图谱ID
query: 搜索查询(用于相关性排序)
include_expired: 是否包含过期内容默认True
limit: 返回结果数量限制
Returns:
PanoramaResult: 广度搜索结果
"""
logger.info(f"PanoramaSearch 广度搜索: {query[:50]}...")
result = PanoramaResult(query=query)
# 获取所有节点
all_nodes = self.get_all_nodes(graph_id)
node_map = {n.uuid: n for n in all_nodes}
result.all_nodes = all_nodes
result.total_nodes = len(all_nodes)
# 获取所有边(包含时间信息)
all_edges = self.get_all_edges(graph_id, include_temporal=True)
result.all_edges = all_edges
result.total_edges = len(all_edges)
# 分类事实
active_facts = []
historical_facts = []
for edge in all_edges:
if not edge.fact:
continue
# 为事实添加实体名称
source_name = node_map.get(edge.source_node_uuid, NodeInfo('', '', [], '', {})).name or edge.source_node_uuid[:8]
target_name = node_map.get(edge.target_node_uuid, NodeInfo('', '', [], '', {})).name or edge.target_node_uuid[:8]
# 判断是否过期/失效
is_historical = edge.is_expired or edge.is_invalid
if is_historical:
# 历史/过期事实,添加时间标记
valid_at = edge.valid_at or "未知"
invalid_at = edge.invalid_at or edge.expired_at or "未知"
fact_with_time = f"[{valid_at} - {invalid_at}] {edge.fact}"
historical_facts.append(fact_with_time)
else:
# 当前有效事实
active_facts.append(edge.fact)
# 基于查询进行相关性排序
query_lower = query.lower()
keywords = [w.strip() for w in query_lower.replace(',', ' ').replace('', ' ').split() if len(w.strip()) > 1]
def relevance_score(fact: str) -> int:
fact_lower = fact.lower()
score = 0
if query_lower in fact_lower:
score += 100
for kw in keywords:
if kw in fact_lower:
score += 10
return score
# 排序并限制数量
active_facts.sort(key=relevance_score, reverse=True)
historical_facts.sort(key=relevance_score, reverse=True)
result.active_facts = active_facts[:limit]
result.historical_facts = historical_facts[:limit] if include_expired else []
result.active_count = len(active_facts)
result.historical_count = len(historical_facts)
logger.info(f"PanoramaSearch完成: {result.active_count}条有效, {result.historical_count}条历史")
return result
def quick_search(
self,
graph_id: str,
query: str,
limit: int = 10
) -> SearchResult:
"""
【QuickSearch - 简单搜索】
快速、轻量级的检索工具:
1. 直接调用Zep语义搜索
2. 返回最相关的结果
3. 适用于简单、直接的检索需求
Args:
graph_id: 图谱ID
query: 搜索查询
limit: 返回结果数量
Returns:
SearchResult: 搜索结果
"""
logger.info(f"QuickSearch 简单搜索: {query[:50]}...")
# 直接调用现有的search_graph方法
result = self.search_graph(
graph_id=graph_id,
query=query,
limit=limit,
scope="edges"
)
logger.info(f"QuickSearch完成: {result.total_count}条结果")
return result