Introduce Project ID for context management, finalizing the stateful API pipeline from file submission to graph construction.
This commit is contained in:
10
backend/app/services/__init__.py
Normal file
10
backend/app/services/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
"""
|
||||
业务服务模块
|
||||
"""
|
||||
|
||||
from .ontology_generator import OntologyGenerator
|
||||
from .graph_builder import GraphBuilderService
|
||||
from .text_processor import TextProcessor
|
||||
|
||||
__all__ = ['OntologyGenerator', 'GraphBuilderService', 'TextProcessor']
|
||||
|
||||
457
backend/app/services/graph_builder.py
Normal file
457
backend/app/services/graph_builder.py
Normal file
@@ -0,0 +1,457 @@
|
||||
"""
|
||||
图谱构建服务
|
||||
接口2:使用Zep API构建Standalone Graph
|
||||
"""
|
||||
|
||||
import os
|
||||
import uuid
|
||||
import time
|
||||
import threading
|
||||
from typing import Dict, Any, List, Optional, Callable
|
||||
from dataclasses import dataclass
|
||||
|
||||
from zep_cloud.client import Zep
|
||||
from zep_cloud import EpisodeData, EntityEdgeSourceTarget
|
||||
|
||||
from ..config import Config
|
||||
from ..models.task import TaskManager, TaskStatus
|
||||
from .text_processor import TextProcessor
|
||||
|
||||
|
||||
@dataclass
class GraphInfo:
    """Summary of a graph: its id, node/edge counts and entity type labels."""

    graph_id: str
    node_count: int
    edge_count: int
    entity_types: List[str]

    def to_dict(self) -> Dict[str, Any]:
        """Return the four fields as a plain dict keyed by field name."""
        exported = ("graph_id", "node_count", "edge_count", "entity_types")
        return {field_name: getattr(self, field_name) for field_name in exported}
|
||||
|
||||
|
||||
class GraphBuilderService:
    """Graph-building service backed by the Zep API.

    Creates a Zep graph, registers an ontology on it, uploads text chunks
    as episodes and reports progress through a ``TaskManager``.
    """

    def __init__(self, api_key: Optional[str] = None):
        """Create the Zep client and the task manager.

        Args:
            api_key: Zep API key; falls back to ``Config.ZEP_API_KEY``.

        Raises:
            ValueError: If neither source yields a key.
        """
        self.api_key = api_key or Config.ZEP_API_KEY
        if not self.api_key:
            raise ValueError("ZEP_API_KEY 未配置")

        self.client = Zep(api_key=self.api_key)
        self.task_manager = TaskManager()

    @staticmethod
    def _scale_progress(fraction: float, start: int, span: int) -> int:
        """Map a 0..1 completion fraction onto ``start .. start + span`` percent.

        The fraction is clamped to [0, 1] so a misbehaving callback cannot
        push the overall progress outside the phase's range.
        """
        clamped = min(max(fraction, 0.0), 1.0)
        return start + int(clamped * span)

    def build_graph_async(
        self,
        text: str,
        ontology: Dict[str, Any],
        graph_name: str = "MiroFish Graph",
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        batch_size: int = 3
    ) -> str:
        """Start a graph build in a background daemon thread.

        Args:
            text: Input text to ingest.
            ontology: Ontology definition with ``entity_types`` / ``edge_types``.
            graph_name: Display name of the graph.
            chunk_size: Chunk size passed to the text splitter.
            chunk_overlap: Overlap passed to the text splitter.
            batch_size: Number of chunks sent per upload request.

        Returns:
            The id of the task tracking the build.
        """
        task_id = self.task_manager.create_task(
            task_type="graph_build",
            metadata={
                "graph_name": graph_name,
                "chunk_size": chunk_size,
                "text_length": len(text),
            }
        )

        # Run the build off the calling thread; the caller polls the task.
        thread = threading.Thread(
            target=self._build_graph_worker,
            args=(task_id, text, ontology, graph_name, chunk_size, chunk_overlap, batch_size)
        )
        thread.daemon = True
        thread.start()

        return task_id

    def _build_graph_worker(
        self,
        task_id: str,
        text: str,
        ontology: Dict[str, Any],
        graph_name: str,
        chunk_size: int,
        chunk_overlap: int,
        batch_size: int
    ):
        """Worker body: run every build step and record progress on the task.

        Any exception is caught and stored on the task (message plus
        traceback) through ``fail_task``; nothing propagates out of the thread.
        """
        try:
            self.task_manager.update_task(
                task_id,
                status=TaskStatus.PROCESSING,
                progress=5,
                message="开始构建图谱..."
            )

            # 1. Create the graph.
            graph_id = self.create_graph(graph_name)
            self.task_manager.update_task(
                task_id,
                progress=10,
                message=f"图谱已创建: {graph_id}"
            )

            # 2. Register the ontology.
            self.set_ontology(graph_id, ontology)
            self.task_manager.update_task(
                task_id,
                progress=15,
                message="本体已设置"
            )

            # 3. Split the text into chunks.
            chunks = TextProcessor.split_text(text, chunk_size, chunk_overlap)
            total_chunks = len(chunks)
            self.task_manager.update_task(
                task_id,
                progress=20,
                message=f"文本已分割为 {total_chunks} 个块"
            )

            # 4. Upload in batches. The callback receives a 0..1 fraction,
            #    which is mapped onto the 20-60% band of the overall task.
            episode_uuids = self.add_text_batches(
                graph_id, chunks, batch_size,
                lambda msg, prog: self.task_manager.update_task(
                    task_id,
                    progress=self._scale_progress(prog, 20, 40),
                    message=msg
                )
            )

            # 5. Wait for Zep to process the uploaded episodes (60-90% band).
            self.task_manager.update_task(
                task_id,
                progress=60,
                message="等待Zep处理数据..."
            )

            self._wait_for_episodes(
                episode_uuids,
                lambda msg, prog: self.task_manager.update_task(
                    task_id,
                    progress=self._scale_progress(prog, 60, 30),
                    message=msg
                )
            )

            # 6. Collect graph statistics.
            self.task_manager.update_task(
                task_id,
                progress=90,
                message="获取图谱信息..."
            )

            graph_info = self._get_graph_info(graph_id)

            self.task_manager.complete_task(task_id, {
                "graph_id": graph_id,
                "graph_info": graph_info.to_dict(),
                "chunks_processed": total_chunks,
            })

        except Exception as e:
            import traceback
            error_msg = f"{str(e)}\n{traceback.format_exc()}"
            self.task_manager.fail_task(task_id, error_msg)

    def create_graph(self, name: str) -> str:
        """Create a Zep graph with a generated ``mirofish_<hex>`` id and return the id."""
        graph_id = f"mirofish_{uuid.uuid4().hex[:16]}"

        self.client.graph.create(
            graph_id=graph_id,
            name=name,
            description="MiroFish Social Simulation Graph"
        )

        return graph_id

    def set_ontology(self, graph_id: str, ontology: Dict[str, Any]):
        """Build model classes from ``ontology`` and register them on the graph.

        Args:
            graph_id: Target graph id.
            ontology: Dict with optional ``entity_types`` and ``edge_types``
                lists; each item needs a ``name`` and may carry
                ``description``, ``attributes`` and (edges) ``source_targets``.

        Edge types without any ``source_targets`` entry are not registered.
        Nothing is sent to Zep when both collections end up empty.
        """
        from pydantic import Field
        from zep_cloud.external_clients.ontology import EntityModel, EntityText, EdgeModel

        # Names reserved by Zep; they cannot be used as attribute names.
        RESERVED_NAMES = {'uuid', 'name', 'group_id', 'name_embedding', 'summary', 'created_at'}

        def safe_attr_name(attr_name: str) -> str:
            """Prefix reserved names with ``entity_`` so they become usable."""
            if attr_name.lower() in RESERVED_NAMES:
                return f"entity_{attr_name}"
            return attr_name

        def build_model(class_name, base, description, attr_defs, attr_type):
            """Create a model subclass of ``base`` with one optional field per attribute."""
            namespace = {"__doc__": description}
            annotations = {}
            for attr_def in attr_defs:
                attr_name = safe_attr_name(attr_def["name"])
                attr_desc = attr_def.get("description", attr_name)
                namespace[attr_name] = Field(description=attr_desc, default=None)
                # Pydantic v2 only picks up fields that are annotated.
                annotations[attr_name] = attr_type
            namespace["__annotations__"] = annotations

            model = type(class_name, (base,), namespace)
            model.__doc__ = description
            return model

        # Entity types, keyed by their ontology name.
        entity_types = {}
        for entity_def in ontology.get("entity_types", []):
            name = entity_def["name"]
            description = entity_def.get("description", f"A {name} entity.")
            entity_types[name] = build_model(
                name,
                EntityModel,
                description,
                entity_def.get("attributes", []),
                Optional[EntityText],
            )

        # Edge types: (model class, allowed source/target pairs) per name.
        edge_definitions = {}
        for edge_def in ontology.get("edge_types", []):
            name = edge_def["name"]
            description = edge_def.get("description", f"A {name} relationship.")

            # UPPER_SNAKE_CASE relation name -> PascalCase class name.
            class_name = ''.join(word.capitalize() for word in name.split('_'))
            edge_class = build_model(
                class_name,
                EdgeModel,
                description,
                edge_def.get("attributes", []),
                Optional[str],  # edge attributes are plain strings
            )

            source_targets = [
                EntityEdgeSourceTarget(
                    source=st.get("source", "Entity"),
                    target=st.get("target", "Entity")
                )
                for st in edge_def.get("source_targets", [])
            ]

            if source_targets:
                edge_definitions[name] = (edge_class, source_targets)

        if entity_types or edge_definitions:
            self.client.graph.set_ontology(
                graph_ids=[graph_id],
                entities=entity_types if entity_types else None,
                edges=edge_definitions if edge_definitions else None,
            )

    def add_text_batches(
        self,
        graph_id: str,
        chunks: List[str],
        batch_size: int = 3,
        progress_callback: Optional[Callable] = None
    ) -> List[str]:
        """Upload ``chunks`` to the graph in batches and collect episode uuids.

        Args:
            graph_id: Target graph id.
            chunks: Text chunks; each becomes one text episode.
            batch_size: Chunks per ``add_batch`` request; must be >= 1.
            progress_callback: Optional ``callback(message, fraction)`` with
                ``fraction`` in 0..1.

        Returns:
            The uuids of the episodes reported back by Zep.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
            Exception: Whatever ``add_batch`` raises is re-raised after the
                failure has been reported through ``progress_callback``.
        """
        if batch_size < 1:
            # A negative step would make range() empty and silently upload
            # nothing; zero would fail with an unrelated range() error.
            raise ValueError(f"batch_size 必须为正整数,当前值: {batch_size}")

        episode_uuids = []
        total_chunks = len(chunks)
        total_batches = (total_chunks + batch_size - 1) // batch_size

        for i in range(0, total_chunks, batch_size):
            batch_chunks = chunks[i:i + batch_size]
            batch_num = i // batch_size + 1

            if progress_callback:
                progress = (i + len(batch_chunks)) / total_chunks
                progress_callback(
                    f"发送第 {batch_num}/{total_batches} 批数据 ({len(batch_chunks)} 块)...",
                    progress
                )

            episodes = [
                EpisodeData(data=chunk, type="text")
                for chunk in batch_chunks
            ]

            try:
                batch_result = self.client.graph.add_batch(
                    graph_id=graph_id,
                    episodes=episodes
                )

                # Collect the uuid of every returned episode; the attribute
                # is looked up as ``uuid_`` first, then ``uuid``.
                if batch_result and isinstance(batch_result, list):
                    for ep in batch_result:
                        ep_uuid = getattr(ep, 'uuid_', None) or getattr(ep, 'uuid', None)
                        if ep_uuid:
                            episode_uuids.append(ep_uuid)

                # Throttle so requests are not sent too quickly.
                time.sleep(1)

            except Exception as e:
                if progress_callback:
                    progress_callback(f"批次 {batch_num} 发送失败: {str(e)}", 0)
                raise

        return episode_uuids

    def _wait_for_episodes(
        self,
        episode_uuids: List[str],
        progress_callback: Optional[Callable] = None,
        timeout: int = 600
    ):
        """Poll each episode until it reports ``processed`` or ``timeout`` elapses.

        Args:
            episode_uuids: Episode uuids to wait for; duplicates are counted once.
            progress_callback: Optional ``callback(message, fraction)``.
            timeout: Maximum wait in seconds before giving up on the rest.
        """
        if not episode_uuids:
            if progress_callback:
                progress_callback("无需等待(没有 episode)", 1.0)
            return

        start_time = time.time()
        pending_episodes = set(episode_uuids)
        completed_count = 0
        # Count unique uuids so that completed/total can actually reach N/N.
        total_episodes = len(pending_episodes)

        if progress_callback:
            progress_callback(f"开始等待 {total_episodes} 个文本块处理...", 0)

        while pending_episodes:
            if time.time() - start_time > timeout:
                if progress_callback:
                    progress_callback(
                        f"部分文本块超时,已完成 {completed_count}/{total_episodes}",
                        completed_count / total_episodes
                    )
                break

            # Iterate over a copy because finished uuids are removed in-loop.
            for ep_uuid in list(pending_episodes):
                try:
                    episode = self.client.graph.episode.get(uuid_=ep_uuid)
                    is_processed = getattr(episode, 'processed', False)
                except Exception:
                    # Best effort: a failed lookup leaves the episode pending
                    # and it is queried again on the next polling round.
                    continue

                if is_processed:
                    pending_episodes.remove(ep_uuid)
                    completed_count += 1

            elapsed = int(time.time() - start_time)
            if progress_callback:
                progress_callback(
                    f"Zep处理中... {completed_count}/{total_episodes} 完成, {len(pending_episodes)} 待处理 ({elapsed}秒)",
                    completed_count / total_episodes if total_episodes > 0 else 0
                )

            if pending_episodes:
                time.sleep(3)  # poll every 3 seconds

        if progress_callback:
            progress_callback(f"处理完成: {completed_count}/{total_episodes}", 1.0)

    def _get_graph_info(self, graph_id: str) -> GraphInfo:
        """Fetch nodes and edges of the graph and summarise them as ``GraphInfo``.

        Entity types are the node labels other than "Entity" and "Node".
        """
        nodes = self.client.graph.node.get_by_graph_id(graph_id=graph_id)
        edges = self.client.graph.edge.get_by_graph_id(graph_id=graph_id)

        entity_types = set()
        for node in nodes:
            if node.labels:
                for label in node.labels:
                    if label not in ["Entity", "Node"]:
                        entity_types.add(label)

        return GraphInfo(
            graph_id=graph_id,
            node_count=len(nodes),
            edge_count=len(edges),
            entity_types=list(entity_types)
        )

    def get_graph_data(self, graph_id: str) -> Dict[str, Any]:
        """Return the nodes and edges of a graph as plain dictionaries.

        Args:
            graph_id: Graph id.

        Returns:
            Dict with ``graph_id``, ``nodes``, ``edges``, ``node_count`` and
            ``edge_count``. Missing labels/summary/attributes/name/fact are
            replaced by empty values.
        """
        nodes = self.client.graph.node.get_by_graph_id(graph_id=graph_id)
        edges = self.client.graph.edge.get_by_graph_id(graph_id=graph_id)

        nodes_data = [
            {
                "uuid": node.uuid_,
                "name": node.name,
                "labels": node.labels or [],
                "summary": node.summary or "",
                "attributes": node.attributes or {},
            }
            for node in nodes
        ]

        edges_data = [
            {
                "uuid": edge.uuid_,
                "name": edge.name or "",
                "fact": edge.fact or "",
                "source_node_uuid": edge.source_node_uuid,
                "target_node_uuid": edge.target_node_uuid,
                "attributes": edge.attributes or {},
            }
            for edge in edges
        ]

        return {
            "graph_id": graph_id,
            "nodes": nodes_data,
            "edges": edges_data,
            "node_count": len(nodes_data),
            "edge_count": len(edges_data),
        }

    def delete_graph(self, graph_id: str):
        """Delete the graph with the given id through the Zep client."""
        self.client.graph.delete(graph_id=graph_id)
|
||||
|
||||
361
backend/app/services/ontology_generator.py
Normal file
361
backend/app/services/ontology_generator.py
Normal file
@@ -0,0 +1,361 @@
|
||||
"""
|
||||
本体生成服务
|
||||
接口1:分析文本内容,生成适合社会模拟的实体和关系类型定义
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Dict, Any, List, Optional
|
||||
from ..utils.llm_client import LLMClient
|
||||
|
||||
|
||||
# System prompt used for ontology generation
|
||||
ONTOLOGY_SYSTEM_PROMPT = """你是一个专业的知识图谱本体设计专家。你的任务是分析给定的文本内容和模拟需求,设计适合**社交媒体舆论模拟**的实体类型和关系类型。
|
||||
|
||||
**重要:你必须输出有效的JSON格式数据,不要输出任何其他内容。**
|
||||
|
||||
## 核心任务背景
|
||||
|
||||
我们正在构建一个**社交媒体舆论模拟系统**。在这个系统中:
|
||||
- 每个实体都是一个可以在社交媒体上发声、互动、传播信息的"账号"或"主体"
|
||||
- 实体之间会相互影响、转发、评论、回应
|
||||
- 我们需要模拟舆论事件中各方的反应和信息传播路径
|
||||
|
||||
因此,**实体必须是现实中真实存在的、可以在社媒上发声和互动的主体**:
|
||||
|
||||
**可以是(鼓励多样化划分)**:
|
||||
- 具体的个人(公众人物、当事人、意见领袖、专家学者)
|
||||
- 公司、企业(包括其官方账号)
|
||||
- 组织机构(大学、协会、NGO、工会等)
|
||||
- 政府部门、监管机构
|
||||
- 媒体机构(报纸、电视台、自媒体、网站)
|
||||
- 社交媒体平台本身
|
||||
- 特定群体代表(如校友会、粉丝团、维权群体等)
|
||||
|
||||
**不可以是**:
|
||||
- 抽象概念(如"舆论"、"情绪"、"趋势")
|
||||
- 主题/话题(如"学术诚信"、"教育改革")
|
||||
- 观点/态度(如"支持方"、"反对方")
|
||||
- 泛指群体(如"网友"、"公众"、"学生群体")
|
||||
|
||||
## 输出格式
|
||||
|
||||
请输出JSON格式,包含以下结构:
|
||||
|
||||
```json
|
||||
{
|
||||
"entity_types": [
|
||||
{
|
||||
"name": "实体类型名称(英文,PascalCase)",
|
||||
"description": "简短描述(英文,不超过100字符)",
|
||||
"attributes": [
|
||||
{
|
||||
"name": "属性名(英文,snake_case)",
|
||||
"type": "text",
|
||||
"description": "属性描述"
|
||||
}
|
||||
],
|
||||
"examples": ["示例实体1", "示例实体2"]
|
||||
}
|
||||
],
|
||||
"edge_types": [
|
||||
{
|
||||
"name": "关系类型名称(英文,UPPER_SNAKE_CASE)",
|
||||
"description": "简短描述(英文,不超过100字符)",
|
||||
"source_targets": [
|
||||
{"source": "源实体类型", "target": "目标实体类型"}
|
||||
],
|
||||
"attributes": []
|
||||
}
|
||||
],
|
||||
"analysis_summary": "对文本内容的简要分析说明(中文)"
|
||||
}
|
||||
```
|
||||
|
||||
## 设计指南
|
||||
|
||||
1. **实体类型设计(重要!请尽量多划分)**:
|
||||
- **数量要求:至少5个,最多10个实体类型**
|
||||
- 每个实体类型代表一类可以在社媒上发声的主体
|
||||
- 尽量细分不同角色,例如:
|
||||
- 不要只用"Person",可以细分为"PublicFigure"、"Expert"、"Whistleblower"等
|
||||
- 不要只用"Organization",可以细分为"University"、"Company"、"NGO"等
|
||||
- description必须清晰说明什么样的实体应该被提取
|
||||
- 每个类型提供2-3个具体示例
|
||||
|
||||
2. **关系类型设计**:
|
||||
- 关系应该反映社媒互动中的真实联系
|
||||
- 关注可能影响舆论传播的关系:
|
||||
- 信息传播:REPORTS_ON, COMMENTS_ON, SHARES
|
||||
- 组织关系:AFFILIATED_WITH, WORKS_FOR, REPRESENTS
|
||||
- 互动关系:RESPONDS_TO, SUPPORTS, OPPOSES
|
||||
- 关系类型:5-10个为宜
|
||||
|
||||
3. **属性设计**:
|
||||
- 每个实体类型1-3个关键属性
|
||||
- 属性应有助于识别实体的社媒影响力(如role、influence_level等)
|
||||
|
||||
## 实体类型参考(请根据文本内容灵活选择和扩展)
|
||||
|
||||
- Person: 普通个人
|
||||
- PublicFigure: 公众人物(明星、网红、意见领袖)
|
||||
- Expert: 专家学者
|
||||
- Journalist: 记者
|
||||
- Company: 公司企业
|
||||
- University: 高校
|
||||
- GovernmentAgency: 政府机构
|
||||
- MediaOutlet: 传统媒体
|
||||
- SelfMedia: 自媒体账号
|
||||
- SocialPlatform: 社交媒体平台
|
||||
- NGO: 非政府组织
|
||||
- IndustryAssociation: 行业协会
|
||||
- AlumniAssociation: 校友会
|
||||
- FanGroup: 粉丝群体/支持群体
|
||||
|
||||
## 关系类型参考
|
||||
|
||||
- WORKS_FOR: 工作于
|
||||
- AFFILIATED_WITH: 隶属于
|
||||
- REPRESENTS: 代表
|
||||
- REGULATES: 监管
|
||||
- REPORTS_ON: 报道
|
||||
- COMMENTS_ON: 评论
|
||||
- RESPONDS_TO: 回应
|
||||
- SUPPORTS: 支持
|
||||
- OPPOSES: 反对
|
||||
- COLLABORATES_WITH: 合作
|
||||
- COMPETES_WITH: 竞争
|
||||
"""
|
||||
|
||||
|
||||
class OntologyGenerator:
    """Ontology generator.

    Sends document text plus a simulation requirement to an LLM and
    post-processes the returned entity / edge type definitions.
    """

    # Maximum number of characters of document text passed to the LLM.
    MAX_TEXT_LENGTH_FOR_LLM = 50000

    def __init__(self, llm_client: Optional[LLMClient] = None):
        """Store the given LLM client, or create a default ``LLMClient``."""
        self.llm_client = llm_client or LLMClient()

    def generate(
        self,
        document_texts: List[str],
        simulation_requirement: str,
        additional_context: Optional[str] = None
    ) -> Dict[str, Any]:
        """Generate an ontology definition.

        Args:
            document_texts: Document texts to analyse.
            simulation_requirement: Description of the simulation requirement.
            additional_context: Optional extra instructions for the LLM.

        Returns:
            The processed LLM result with ``entity_types``, ``edge_types``
            and ``analysis_summary`` guaranteed to be present.
        """
        user_message = self._build_user_message(
            document_texts,
            simulation_requirement,
            additional_context
        )

        messages = [
            {"role": "system", "content": ONTOLOGY_SYSTEM_PROMPT},
            {"role": "user", "content": user_message}
        ]

        result = self.llm_client.chat_json(
            messages=messages,
            temperature=0.3,
            max_tokens=4096
        )

        return self._validate_and_process(result)

    def _build_user_message(
        self,
        document_texts: List[str],
        simulation_requirement: str,
        additional_context: Optional[str]
    ) -> str:
        """Assemble the user prompt from requirement, documents and extra context.

        The combined document text is cut to ``MAX_TEXT_LENGTH_FOR_LLM``
        characters, with a note stating the original length; this only
        shortens what this method puts into the prompt.
        """
        combined_text = "\n\n---\n\n".join(document_texts)
        original_length = len(combined_text)

        if original_length > self.MAX_TEXT_LENGTH_FOR_LLM:
            combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM]
            combined_text += f"\n\n...(原文共{original_length}字,已截取前{self.MAX_TEXT_LENGTH_FOR_LLM}字用于本体分析)..."

        message = (
            "## 模拟需求\n"
            "\n"
            f"{simulation_requirement}\n"
            "\n"
            "## 文档内容\n"
            "\n"
            f"{combined_text}\n"
        )

        if additional_context:
            message += (
                "\n"
                "## 额外说明\n"
                "\n"
                f"{additional_context}\n"
            )

        message += (
            "\n"
            "请根据以上内容,设计适合社会舆论模拟的实体类型和关系类型。\n"
            "记住:所有实体类型必须是现实中可以发声的主体,不能是抽象概念。\n"
        )

        return message

    @staticmethod
    def _normalize_type_def(type_def: Dict[str, Any], list_fields) -> None:
        """Fill missing/None list fields with ``[]`` and cap the description at 100 chars."""
        for field_name in list_fields:
            if type_def.get(field_name) is None:
                type_def[field_name] = []

        description = type_def.get("description")
        # Only strings are truncated; a null description must not crash len().
        if isinstance(description, str) and len(description) > 100:
            type_def["description"] = description[:97] + "..."

    def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Ensure required keys exist and normalise every type definition in place.

        Missing or ``None`` values for ``entity_types`` / ``edge_types`` become
        empty lists, a missing ``analysis_summary`` becomes ``""``, and
        descriptions longer than 100 characters are truncated to 97 + "...".
        """
        for key in ("entity_types", "edge_types"):
            if result.get(key) is None:
                result[key] = []
        if "analysis_summary" not in result:
            result["analysis_summary"] = ""

        for entity in result["entity_types"]:
            self._normalize_type_def(entity, ("attributes", "examples"))

        for edge in result["edge_types"]:
            self._normalize_type_def(edge, ("source_targets", "attributes"))

        return result

    @staticmethod
    def _string_literal(value: Any) -> str:
        """Render ``value`` as a double-quoted literal that is valid Python source.

        ``json.dumps`` escapes quotes, backslashes and control characters;
        plain text comes out exactly as ``"text"``.
        """
        return json.dumps(str(value), ensure_ascii=False)

    @staticmethod
    def _docstring_text(value: Any) -> str:
        """Escape backslashes and double quotes for use inside a triple-quoted docstring."""
        return str(value).replace('\\', '\\\\').replace('"', '\\"')

    @staticmethod
    def _edge_class_name(edge_name: str) -> str:
        """Convert an UPPER_SNAKE_CASE relation name to a PascalCase class name."""
        return ''.join(word.capitalize() for word in edge_name.split('_'))

    def _append_model_class(
        self,
        code_lines: List[str],
        class_name: str,
        base_name: str,
        description: Any,
        attributes: List[Dict[str, Any]]
    ) -> None:
        """Append the source lines of one model class, followed by two blank lines."""
        code_lines.append(f'class {class_name}({base_name}):')
        code_lines.append(f'    """{self._docstring_text(description)}"""')

        if attributes:
            for attr in attributes:
                attr_name = attr["name"]
                attr_desc = attr.get("description", attr_name)
                code_lines.append(f'    {attr_name}: EntityText = Field(')
                code_lines.append(f'        description={self._string_literal(attr_desc)},')
                code_lines.append('        default=None')
                code_lines.append('    )')
        else:
            code_lines.append('    pass')

        code_lines.append('')
        code_lines.append('')

    def generate_python_code(self, ontology: Dict[str, Any]) -> str:
        """Render the ontology as Python source code.

        Descriptions and names are emitted as escaped literals, so text that
        contains quotes or backslashes still yields code that compiles.

        Args:
            ontology: Ontology definition with ``entity_types`` / ``edge_types``.

        Returns:
            Source text defining one class per type plus the ``ENTITY_TYPES``,
            ``EDGE_TYPES`` and ``EDGE_SOURCE_TARGETS`` dictionaries.
        """
        entity_defs = ontology.get("entity_types", [])
        edge_defs = ontology.get("edge_types", [])

        code_lines = [
            '"""',
            '自定义实体类型定义',
            '由MiroFish自动生成,用于社会舆论模拟',
            '"""',
            '',
            'from pydantic import Field',
            'from zep_cloud.external_clients.ontology import EntityModel, EntityText, EdgeModel',
            '',
            '',
            '# ============== 实体类型定义 ==============',
            '',
        ]

        # Entity classes.
        for entity in entity_defs:
            name = entity["name"]
            self._append_model_class(
                code_lines,
                name,
                'EntityModel',
                entity.get("description", f"A {name} entity."),
                entity.get("attributes", []),
            )

        code_lines.append('# ============== 关系类型定义 ==============')
        code_lines.append('')

        # Edge classes.
        for edge in edge_defs:
            name = edge["name"]
            self._append_model_class(
                code_lines,
                self._edge_class_name(name),
                'EdgeModel',
                edge.get("description", f"A {name} relationship."),
                edge.get("attributes", []),
            )

        # Lookup dictionaries from type name to class.
        code_lines.append('# ============== 类型配置 ==============')
        code_lines.append('')
        code_lines.append('ENTITY_TYPES = {')
        for entity in entity_defs:
            name = entity["name"]
            code_lines.append(f'    {self._string_literal(name)}: {name},')
        code_lines.append('}')
        code_lines.append('')
        code_lines.append('EDGE_TYPES = {')
        for edge in edge_defs:
            name = edge["name"]
            code_lines.append(f'    {self._string_literal(name)}: {self._edge_class_name(name)},')
        code_lines.append('}')
        code_lines.append('')

        # Allowed source/target pairs per edge type; edges without pairs are omitted.
        code_lines.append('EDGE_SOURCE_TARGETS = {')
        for edge in edge_defs:
            name = edge["name"]
            source_targets = edge.get("source_targets", [])
            if source_targets:
                st_list = ', '.join(
                    json.dumps(
                        {
                            "source": st.get("source", "Entity"),
                            "target": st.get("target", "Entity"),
                        },
                        ensure_ascii=False,
                    )
                    for st in source_targets
                )
                code_lines.append(f'    {self._string_literal(name)}: [{st_list}],')
        code_lines.append('}')

        return '\n'.join(code_lines)
|
||||
|
||||
71
backend/app/services/text_processor.py
Normal file
71
backend/app/services/text_processor.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""
|
||||
文本处理服务
|
||||
"""
|
||||
|
||||
from typing import List, Optional
|
||||
from ..utils.file_parser import FileParser, split_text_into_chunks
|
||||
|
||||
|
||||
class TextProcessor:
    """Text processing helpers (all static)."""

    @staticmethod
    def extract_from_files(file_paths: List[str]) -> str:
        """Extract text from several files via ``FileParser.extract_from_multiple``."""
        return FileParser.extract_from_multiple(file_paths)

    @staticmethod
    def split_text(
        text: str,
        chunk_size: int = 500,
        overlap: int = 50
    ) -> List[str]:
        """Split text into chunks via ``split_text_into_chunks``.

        Args:
            text: Source text.
            chunk_size: Chunk size.
            overlap: Overlap between chunks.

        Returns:
            List of text chunks.
        """
        return split_text_into_chunks(text, chunk_size, overlap)

    @staticmethod
    def preprocess_text(text: str) -> str:
        """Normalise line endings and whitespace.

        - Converts ``\\r\\n`` and ``\\r`` to ``\\n``.
        - Strips leading/trailing whitespace from every line.
        - Collapses runs of three or more newlines to exactly two.
        - Strips the text as a whole.

        Args:
            text: Source text.

        Returns:
            The cleaned text.
        """
        import re

        # Normalise line endings.
        text = text.replace('\r\n', '\n').replace('\r', '\n')

        # Strip each line first: whitespace-only lines become empty lines and
        # must take part in the blank-line collapsing below.
        text = '\n'.join(line.strip() for line in text.split('\n'))

        # Keep at most one empty line (two newlines) in a row.
        text = re.sub(r'\n{3,}', '\n\n', text)

        return text.strip()

    @staticmethod
    def get_text_stats(text: str) -> dict:
        """Return character, line (newline count + 1) and whitespace-separated word counts."""
        return {
            "total_chars": len(text),
            "total_lines": text.count('\n') + 1,
            "total_words": len(text.split()),
        }
|
||||
|
||||
Reference in New Issue
Block a user