Files
microfish/backend/app/services/ontology_generator.py
2026-06-17 15:32:47 +07:00

507 lines
19 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
本体生成服务
接口1分析文本内容生成适合社会模拟的实体和关系类型定义
"""
import json
import logging
import re
from typing import Dict, Any, List, Optional
from ..utils.llm_client import LLMClient
from ..utils.locale import get_language_instruction
logger = logging.getLogger(__name__)
def _to_pascal_case(name: str) -> str:
"""将任意格式的名称转换为 PascalCase'works_for' -> 'WorksFor', 'person' -> 'Person'"""
# 按非字母数字字符分割
parts = re.split(r'[^a-zA-Z0-9]+', name)
# 再按 camelCase 边界分割(如 'camelCase' -> ['camel', 'Case']
words = []
for part in parts:
words.extend(re.sub(r'([a-z])([A-Z])', r'\1_\2', part).split('_'))
# 每个词首字母大写,过滤空串
result = ''.join(word.capitalize() for word in words if word)
return result if result else 'Unknown'
# System prompt for ontology generation.
# OntologyGenerator.generate() places this text at the start of the system
# message, followed by the language instruction and the naming rules, so the
# literal below is runtime text sent to the LLM and must not be reworded
# casually.
ONTOLOGY_SYSTEM_PROMPT = """You are a professional knowledge graph ontology design expert. Your task is to analyze the given text content and simulation requirements, and design entity types and relationship types suitable for **social media opinion simulation**.
**Important: You must output valid JSON format data only, do not output any other content.**
## Core Task Background
We are building a **social media opinion simulation system**. In this system:
- Each entity is an "account" or "subject" that can speak, interact, and spread information on social media
- Entities influence each other, repost, comment, and respond
- We need to simulate the reactions and information propagation paths of various parties in opinion events
Therefore, **entities must be real-world subjects that can speak and interact on social media**:
**Allowed**:
- Specific individuals (public figures, parties involved, opinion leaders, experts, ordinary people)
- Companies, enterprises (including their official accounts)
- Organizations (universities, associations, NGOs, unions, etc.)
- Government departments, regulatory agencies
- Media organizations (newspapers, TV stations, self-media, websites)
- Social media platforms themselves
- Specific group representatives (alumni associations, fan groups, advocacy groups, etc.)
**Not allowed**:
- Abstract concepts (such as "public opinion", "emotion", "trend")
- Topics/subjects (such as "academic integrity", "education reform")
- Viewpoints/attitudes (such as "supporters", "opponents")
## Output Format
Output in JSON format containing the following structure:
```json
{
"entity_types": [
{
"name": "Entity type name (English, PascalCase)",
"description": "Brief description (English, max 100 chars)",
"attributes": [
{
"name": "Attribute name (English, snake_case)",
"type": "text",
"description": "Attribute description"
}
],
"examples": ["Example entity 1", "Example entity 2"]
}
],
"edge_types": [
{
"name": "Relationship type name (English, UPPER_SNAKE_CASE)",
"description": "Brief description (English, max 100 chars)",
"source_targets": [
{"source": "Source entity type", "target": "Target entity type"}
],
"attributes": []
}
],
"analysis_summary": "Brief analysis of the text content"
}
```
## Design Guidelines (Extremely Important!)
### 1. Entity Type Design - Must Strictly Follow
**Quantity requirement: Exactly 10 entity types**
**Hierarchy requirement (must include both specific types and fallback types)**:
Your 10 entity types must include the following hierarchy:
A. **Fallback types (must include, place last 2 in the list)**:
- `Person`: Fallback type for any natural person. When a person doesn't match any more specific type, use this.
- `Organization`: Fallback type for any organization. When an organization doesn't match any more specific type, use this.
B. **Specific types (8, designed based on text content)**:
- Design more specific types for the main roles appearing in the text
- Example: If the text involves an academic event, you can have `Student`, `Professor`, `University`
- Example: If the text involves a business event, you can have `Company`, `CEO`, `Employee`
**Why fallback types are needed**:
- Various people appear in the text, such as "school teacher", "bystander", "an online user"
- If no specific type matches, they should fall into `Person`
- Similarly, small organizations, temporary groups should fall into `Organization`
**Design principles for specific types**:
- Identify high-frequency or key role types from the text
- Each specific type should have clear boundaries, avoid overlap
- Description must clearly explain the difference between this type and the fallback type
### 2. Relationship Type Design
- Quantity: 6-10
- Relationships should reflect real connections in social media interaction
- Ensure source_targets of relationships cover the entity types you defined
### 3. Attribute Design
- 1-3 key attributes per entity type
- **Note**: Attribute names cannot use `name`, `uuid`, `group_id`, `created_at`, `summary` (these are system reserved words)
- Recommended: `full_name`, `title`, `role`, `position`, `location`, `description`, etc.
## Entity Type Reference
**Individuals (Specific)**:
- Student: Student
- Professor: Professor/Scholar
- Journalist: Journalist
- Celebrity: Celebrity/Influencer
- Executive: Executive
- Official: Government official
- Lawyer: Lawyer
- Doctor: Doctor
**Individuals (Fallback)**:
- Person: Any natural person (used when not matching specific types above)
**Organizations (Specific)**:
- University: University
- Company: Company/Enterprise
- GovernmentAgency: Government agency
- MediaOutlet: Media organization
- Hospital: Hospital
- School: School
- NGO: Non-governmental organization
**Organizations (Fallback)**:
- Organization: Any organization (used when not matching specific types above)
## Relationship Type Reference
- WORKS_FOR: Works at
- STUDIES_AT: Studies at
- AFFILIATED_WITH: Affiliated with
- REPRESENTS: Represents
- REGULATES: Regulates
- REPORTS_ON: Reports on
- COMMENTS_ON: Comments on
- RESPONDS_TO: Responds to
- SUPPORTS: Supports
- OPPOSES: Opposes
- COLLABORATES_WITH: Collaborates with
- COMPETES_WITH: Competes with
"""
class OntologyGenerator:
    """
    Ontology generator.

    Sends document text plus a simulation requirement to an LLM, normalises
    the entity and relationship type definitions it returns, and can render
    an ontology as Python source code.
    """

    # Maximum number of characters of document text passed to the LLM.
    MAX_TEXT_LENGTH_FOR_LLM = 50000

    # Limits applied during validation (the original in-code note attributes
    # them to the Zep API: at most 10 custom entity types and 10 edge types).
    MAX_ENTITY_TYPES = 10
    MAX_EDGE_TYPES = 10

    # Descriptions longer than this are shortened and end with "...".
    MAX_DESCRIPTION_LENGTH = 100

    def __init__(self, llm_client: Optional["LLMClient"] = None):
        """Store the given LLM client, or create a default ``LLMClient``."""
        self.llm_client = llm_client or LLMClient()

    def generate(
        self,
        document_texts: List[str],
        simulation_requirement: str,
        additional_context: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Generate an ontology definition.

        Args:
            document_texts: List of document texts.
            simulation_requirement: Description of the simulation requirement.
            additional_context: Optional extra context appended to the prompt.

        Returns:
            The validated ontology dict (``entity_types``, ``edge_types``,
            ``analysis_summary``).
        """
        user_message = self._build_user_message(
            document_texts,
            simulation_requirement,
            additional_context
        )
        lang_instruction = get_language_instruction()
        system_prompt = f"{ONTOLOGY_SYSTEM_PROMPT}\n\n{lang_instruction}\nIMPORTANT: Entity type names MUST be in English PascalCase (e.g., 'PersonEntity', 'MediaOrganization'). Relationship type names MUST be in English UPPER_SNAKE_CASE (e.g., 'WORKS_FOR'). Attribute names MUST be in English snake_case. Only description fields and analysis_summary should use the specified language above."
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message}
        ]
        # Ask the LLM for a JSON answer, then validate and post-process it.
        result = self.llm_client.chat_json(
            messages=messages,
            temperature=0.3,
            max_tokens=4096
        )
        return self._validate_and_process(result)

    def _build_user_message(
        self,
        document_texts: List[str],
        simulation_requirement: str,
        additional_context: Optional[str]
    ) -> str:
        """
        Build the user message sent to the LLM.

        The documents are joined with a ``---`` separator and cut to
        ``MAX_TEXT_LENGTH_FOR_LLM`` characters; when a cut happens a note
        with the original length is appended. Only the returned message is
        affected by the cut.

        Args:
            document_texts: List of document texts.
            simulation_requirement: Description of the simulation requirement.
            additional_context: Optional extra section; skipped when falsy.

        Returns:
            The complete user message.
        """
        combined_text = "\n\n---\n\n".join(document_texts)
        original_length = len(combined_text)
        if original_length > self.MAX_TEXT_LENGTH_FOR_LLM:
            combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM]
            combined_text += f"\n\n...(原文共{original_length}字,已截取前{self.MAX_TEXT_LENGTH_FOR_LLM}字用于本体分析)..."
        # NOTE: the lines inside the triple-quoted literals start at column 0
        # on purpose; indenting them would change the text sent to the LLM.
        message = f"""## 模拟需求
{simulation_requirement}
## 文档内容
{combined_text}
"""
        if additional_context:
            message += f"""
## 额外说明
{additional_context}
"""
        message += """
请根据以上内容,设计适合社会舆论模拟的实体类型和关系类型。
**必须遵守的规则**
1. 必须正好输出10个实体类型
2. 最后2个必须是兜底类型Person个人兜底和 Organization组织兜底
3. 前8个是根据文本内容设计的具体类型
4. 所有实体类型必须是现实中可以发声的主体,不能是抽象概念
5. 属性名不能使用 name、uuid、group_id 等保留字,用 full_name、org_name 等替代
"""
        return message

    @staticmethod
    def _to_upper_snake_case(name: Any) -> str:
        """Convert a relationship name to UPPER_SNAKE_CASE.

        camelCase boundaries and every run of characters other than ASCII
        letters and digits become a single underscore ('worksFor' and
        'works for' both give 'WORKS_FOR'). When no ASCII letters or digits
        are present the upper-cased input is returned unchanged.
        """
        text = str(name)
        # Only a lower->upper transition counts as a word boundary, so names
        # that are already upper case (e.g. 'B2B_PARTNER') are left intact.
        marked = re.sub(r'([a-z])([A-Z])', r'\1_\2', text)
        words = [word for word in re.split(r'[^a-zA-Z0-9]+', marked) if word]
        if not words:
            return text.upper()
        return '_'.join(words).upper()

    @classmethod
    def _clip_description(cls, item: Dict[str, Any]) -> None:
        """Shorten ``item['description']`` in place to the length limit.

        Non-string descriptions (for example a JSON null) are left alone
        instead of raising.
        """
        description = item.get("description")
        limit = cls.MAX_DESCRIPTION_LENGTH
        if isinstance(description, str) and len(description) > limit:
            item["description"] = description[:limit - 3] + "..."

    @classmethod
    def _ensure_fallback_types(
        cls,
        entity_types: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Return the entity types with both fallback types guaranteed.

        Missing ``Person`` / ``Organization`` definitions are appended. When
        the result would exceed ``MAX_ENTITY_TYPES``, specific types are
        dropped starting from the end of the list; a fallback type that is
        already present is never dropped.

        Args:
            entity_types: De-duplicated entity types, each with a ``name``.

        Returns:
            A new list of at most ``MAX_ENTITY_TYPES`` entity types.
        """
        # Built per call so callers never share (and mutate) the same dicts.
        defaults = [
            {
                "name": "Person",
                "description": "Any individual person not fitting other specific person types.",
                "attributes": [
                    {"name": "full_name", "type": "text", "description": "Full name of the person"},
                    {"name": "role", "type": "text", "description": "Role or occupation"}
                ],
                "examples": ["ordinary citizen", "anonymous netizen"]
            },
            {
                "name": "Organization",
                "description": "Any organization not fitting other specific organization types.",
                "attributes": [
                    {"name": "org_name", "type": "text", "description": "Name of the organization"},
                    {"name": "org_type", "type": "text", "description": "Type of organization"}
                ],
                "examples": ["small business", "community group"]
            },
        ]
        protected = {fallback["name"] for fallback in defaults}
        present = {entity["name"] for entity in entity_types}
        missing = [fallback for fallback in defaults if fallback["name"] not in present]

        # Number of specific types that have to go to respect the limit.
        overflow = len(entity_types) + len(missing) - cls.MAX_ENTITY_TYPES
        kept = []
        # Walk backwards: earlier types are treated as the more important
        # ones, so removal starts at the tail but skips the fallbacks (which
        # the prompt asks the LLM to put last).
        for entity in reversed(entity_types):
            if overflow > 0 and entity["name"] not in protected:
                overflow -= 1
                continue
            kept.append(entity)
        kept.reverse()
        return kept + missing

    def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Validate and post-process the LLM result in place.

        Steps: fill in missing top-level keys; convert entity names to
        PascalCase and edge names to UPPER_SNAKE_CASE; keep ``source_targets``
        references in line with the converted entity names; default missing
        lists; clip long descriptions; drop unnamed and duplicate entity
        types; guarantee the ``Person`` / ``Organization`` fallback types;
        enforce the entity/edge type limits.

        Args:
            result: The dict parsed from the LLM answer.

        Returns:
            The same dict, normalised.
        """
        result.setdefault("entity_types", [])
        result.setdefault("edge_types", [])
        result.setdefault("analysis_summary", "")

        # Original name -> PascalCase name, used below to repair the entity
        # references inside edge source_targets.
        entity_name_map = {}
        for entity in result["entity_types"]:
            if "name" in entity:
                original_name = entity["name"]
                entity["name"] = _to_pascal_case(original_name)
                if entity["name"] != original_name:
                    logger.warning(
                        "Entity type name '%s' auto-converted to '%s'",
                        original_name, entity["name"]
                    )
                entity_name_map[original_name] = entity["name"]
            entity.setdefault("attributes", [])
            entity.setdefault("examples", [])
            self._clip_description(entity)

        for edge in result["edge_types"]:
            if "name" in edge:
                original_name = edge["name"]
                edge["name"] = self._to_upper_snake_case(original_name)
                if edge["name"] != original_name:
                    logger.warning(
                        "Edge type name '%s' auto-converted to '%s'",
                        original_name, edge["name"]
                    )
            edge.setdefault("source_targets", [])
            edge.setdefault("attributes", [])
            for pair in edge["source_targets"]:
                for endpoint in ("source", "target"):
                    reference = pair.get(endpoint)
                    if reference in entity_name_map:
                        pair[endpoint] = entity_name_map[reference]
                    elif isinstance(reference, str) and reference:
                        # Not an exact original name (e.g. 'media_outlet' for
                        # 'MediaOutlet'): apply the same conversion entity
                        # names went through so the reference can match.
                        pair[endpoint] = _to_pascal_case(reference)
            self._clip_description(edge)

        # De-duplicate by name, keeping the first occurrence; entity types
        # without a name are dropped.
        seen_names = set()
        unique_entities = []
        for entity in result["entity_types"]:
            name = entity.get("name", "")
            if not name:
                continue
            if name in seen_names:
                logger.warning("Duplicate entity type '%s' removed during validation", name)
                continue
            seen_names.add(name)
            unique_entities.append(entity)

        result["entity_types"] = self._ensure_fallback_types(unique_entities)

        if len(result["edge_types"]) > self.MAX_EDGE_TYPES:
            result["edge_types"] = result["edge_types"][:self.MAX_EDGE_TYPES]
        return result

    @staticmethod
    def _string_literal(value: Any) -> str:
        """Return ``value`` as a double-quoted, fully escaped string literal."""
        return json.dumps(str(value), ensure_ascii=False)

    @staticmethod
    def _docstring_literal(value: Any) -> str:
        """Return ``value`` as a triple-quoted docstring literal.

        Backslashes and double quotes are escaped so that no content can end
        the literal early; text without them is emitted unchanged.
        """
        escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
        return f'"""{escaped}"""'

    @staticmethod
    def _edge_class_name(edge_name: str) -> str:
        """Turn an edge name such as 'WORKS_FOR' into a class name ('WorksFor')."""
        return ''.join(word.capitalize() for word in edge_name.split('_'))

    @classmethod
    def _render_model_class(
        cls,
        class_name: str,
        base_class: str,
        description: Any,
        attributes: List[Dict[str, Any]]
    ) -> List[str]:
        """Render one model class (header, docstring, fields) as code lines."""
        lines = [
            f'class {class_name}({base_class}):',
            f'    {cls._docstring_literal(description)}',
        ]
        if attributes:
            for attr in attributes:
                attr_name = attr["name"]
                attr_desc = attr.get("description", attr_name)
                lines.append(f'    {attr_name}: EntityText = Field(')
                lines.append(f'        description={cls._string_literal(attr_desc)},')
                lines.append('        default=None')
                lines.append('    )')
        else:
            lines.append('    pass')
        # Two blank lines between top-level definitions.
        lines.extend(['', ''])
        return lines

    def generate_python_code(self, ontology: Dict[str, Any]) -> str:
        """
        Convert an ontology definition into Python source code.

        The output declares one class per entity type (``EntityModel``) and
        per edge type (``EdgeModel``), followed by the ``ENTITY_TYPES``,
        ``EDGE_TYPES`` and ``EDGE_SOURCE_TARGETS`` dictionaries. Descriptions
        and names placed inside string literals are escaped, so quotes or
        backslashes in them cannot break the generated code.

        Args:
            ontology: The ontology definition.

        Returns:
            The Python source code as a single string.
        """
        entity_types = ontology.get("entity_types", [])
        edge_types = ontology.get("edge_types", [])

        code_lines = [
            '"""',
            '自定义实体类型定义',
            '由MiroFish自动生成用于社会舆论模拟',
            '"""',
            '',
            'from pydantic import Field',
            'from zep_cloud.external_clients.ontology import EntityModel, EntityText, EdgeModel',
            '',
            '',
            '# ============== 实体类型定义 ==============',
            '',
        ]

        # Entity type classes.
        for entity in entity_types:
            name = entity["name"]
            desc = entity.get("description", f"A {name} entity.")
            code_lines.extend(
                self._render_model_class(name, "EntityModel", desc, entity.get("attributes", []))
            )

        code_lines.append('# ============== 关系类型定义 ==============')
        code_lines.append('')

        # Relationship type classes.
        for edge in edge_types:
            name = edge["name"]
            desc = edge.get("description", f"A {name} relationship.")
            code_lines.extend(
                self._render_model_class(
                    self._edge_class_name(name), "EdgeModel", desc, edge.get("attributes", [])
                )
            )

        # Name -> class lookup dictionaries.
        code_lines.append('# ============== 类型配置 ==============')
        code_lines.append('')
        code_lines.append('ENTITY_TYPES = {')
        for entity in entity_types:
            name = entity["name"]
            code_lines.append(f'    {self._string_literal(name)}: {name},')
        code_lines.append('}')
        code_lines.append('')
        code_lines.append('EDGE_TYPES = {')
        for edge in edge_types:
            name = edge["name"]
            code_lines.append(f'    {self._string_literal(name)}: {self._edge_class_name(name)},')
        code_lines.append('}')
        code_lines.append('')

        # Allowed (source, target) pairs per edge; edges without any pair are
        # left out of the mapping.
        code_lines.append('EDGE_SOURCE_TARGETS = {')
        for edge in edge_types:
            source_targets = edge.get("source_targets", [])
            if not source_targets:
                continue
            pairs = ', '.join(
                '{"source": %s, "target": %s}' % (
                    self._string_literal(pair.get("source", "Entity")),
                    self._string_literal(pair.get("target", "Entity")),
                )
                for pair in source_targets
            )
            code_lines.append(f'    {self._string_literal(edge["name"])}: [{pairs}],')
        code_lines.append('}')
        return '\n'.join(code_lines)