Introduce Project ID for context management, finalizing the stateful API pipeline from file submission to graph construction.

2025-11-28 17:21:08 +08:00
parent 9657061b26
commit 08f417f3b7
20 changed files with 2850 additions and 1 deletions
--- a/backend/app/services/__init__.py
+++ b/backend/app/services/__init__.py
@@ -0,0 +1,10 @@
+"""
+业务服务模块
+"""
+
+from .ontology_generator import OntologyGenerator
+from .graph_builder import GraphBuilderService
+from .text_processor import TextProcessor
+
+# Names exported by `from <this package> import *`: the three service classes imported above.
+__all__ = ['OntologyGenerator', 'GraphBuilderService', 'TextProcessor']
+
--- a/backend/app/services/graph_builder.py
+++ b/backend/app/services/graph_builder.py
@@ -0,0 +1,457 @@
+"""
+图谱构建服务
+接口2：使用Zep API构建Standalone Graph
+"""
+
+import os
+import uuid
+import time
+import threading
+from typing import Dict, Any, List, Optional, Callable
+from dataclasses import dataclass
+
+from zep_cloud.client import Zep
+from zep_cloud import EpisodeData, EntityEdgeSourceTarget
+
+from ..config import Config
+from ..models.task import TaskManager, TaskStatus
+from .text_processor import TextProcessor
+
+
+@dataclass
+class GraphInfo:
+    """Summary of a graph: its id, node/edge counts and entity type labels."""
+    # Identifier of the graph the counts below belong to.
+    graph_id: str
+    # Number of nodes counted for the graph.
+    node_count: int
+    # Number of edges counted for the graph.
+    edge_count: int
+    # Entity type labels found on the graph's nodes.
+    entity_types: List[str]
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the four fields as a plain dict keyed by field name."""
+        exported_fields = ("graph_id", "node_count", "edge_count", "entity_types")
+        return {field_name: getattr(self, field_name) for field_name in exported_fields}
+
+
+class GraphBuilderService:
+    """
+    Graph construction service.
+
+    Wraps a Zep client to create a graph, register an ontology, upload text
+    chunks as episodes and read the resulting nodes/edges back. Long-running
+    builds run in a background thread and report progress through TaskManager.
+    """
+
+    # Attribute names Zep reserves; ontology attributes using them get renamed.
+    _RESERVED_ATTR_NAMES = frozenset(
+        {'uuid', 'name', 'group_id', 'name_embedding', 'summary', 'created_at'}
+    )
+
+    def __init__(self, api_key: Optional[str] = None):
+        """
+        Create the Zep client and the task manager.
+
+        Args:
+            api_key: Zep API key; Config.ZEP_API_KEY is used when omitted.
+
+        Raises:
+            ValueError: if neither source provides a key.
+        """
+        self.api_key = api_key or Config.ZEP_API_KEY
+        if not self.api_key:
+            raise ValueError("ZEP_API_KEY 未配置")
+
+        self.client = Zep(api_key=self.api_key)
+        self.task_manager = TaskManager()
+
+    @staticmethod
+    def _stage_progress(base: int, span: int, ratio: float) -> int:
+        """Map a 0..1 stage ratio onto the overall percent range base..base+span."""
+        clamped = max(0.0, min(1.0, ratio))
+        return base + int(clamped * span)
+
+    @classmethod
+    def _safe_attr_name(cls, attr_name: str) -> str:
+        """Prefix reserved attribute names (case-insensitive match) with 'entity_'."""
+        if attr_name.lower() in cls._RESERVED_ATTR_NAMES:
+            return f"entity_{attr_name}"
+        return attr_name
+
+    def build_graph_async(
+        self,
+        text: str,
+        ontology: Dict[str, Any],
+        graph_name: str = "MiroFish Graph",
+        chunk_size: int = 500,
+        chunk_overlap: int = 50,
+        batch_size: int = 3
+    ) -> str:
+        """
+        Start a graph build in a background daemon thread.
+
+        Args:
+            text: input text to turn into a graph
+            ontology: ontology definition (dict with entity_types / edge_types)
+            graph_name: display name of the graph
+            chunk_size: size of each text chunk
+            chunk_overlap: overlap between consecutive chunks
+            batch_size: number of chunks sent per request
+
+        Returns:
+            The task ID created by the task manager; poll it for progress.
+        """
+        task_id = self.task_manager.create_task(
+            task_type="graph_build",
+            metadata={
+                "graph_name": graph_name,
+                "chunk_size": chunk_size,
+                "text_length": len(text),
+            }
+        )
+
+        # Daemon thread: the build does not keep the process alive on shutdown.
+        thread = threading.Thread(
+            target=self._build_graph_worker,
+            args=(task_id, text, ontology, graph_name, chunk_size, chunk_overlap, batch_size),
+            daemon=True,
+        )
+        thread.start()
+
+        return task_id
+
+    def _build_graph_worker(
+        self,
+        task_id: str,
+        text: str,
+        ontology: Dict[str, Any],
+        graph_name: str,
+        chunk_size: int,
+        chunk_overlap: int,
+        batch_size: int
+    ):
+        """
+        Thread body for a graph build.
+
+        Runs create graph -> set ontology -> split text -> upload batches ->
+        wait for processing -> collect graph info, updating the task after each
+        step. Any exception is recorded on the task via fail_task (message plus
+        traceback) instead of propagating out of the thread.
+        """
+        def stage_callback(base: int, span: int) -> Callable[[str, float], None]:
+            # Each stage reports a 0..1 ratio; scale it into its own slice of
+            # the overall percentage so progress actually advances in-stage.
+            def report(msg: str, ratio: float) -> None:
+                self.task_manager.update_task(
+                    task_id,
+                    progress=self._stage_progress(base, span, ratio),
+                    message=msg
+                )
+            return report
+
+        try:
+            self.task_manager.update_task(
+                task_id,
+                status=TaskStatus.PROCESSING,
+                progress=5,
+                message="开始构建图谱..."
+            )
+
+            # 1. Create the graph
+            graph_id = self.create_graph(graph_name)
+            self.task_manager.update_task(
+                task_id,
+                progress=10,
+                message=f"图谱已创建: {graph_id}"
+            )
+
+            # 2. Register the ontology
+            self.set_ontology(graph_id, ontology)
+            self.task_manager.update_task(
+                task_id,
+                progress=15,
+                message="本体已设置"
+            )
+
+            # 3. Split the text into chunks
+            chunks = TextProcessor.split_text(text, chunk_size, chunk_overlap)
+            total_chunks = len(chunks)
+            self.task_manager.update_task(
+                task_id,
+                progress=20,
+                message=f"文本已分割为 {total_chunks} 个块"
+            )
+
+            # 4. Upload in batches (20-60%)
+            episode_uuids = self.add_text_batches(
+                graph_id, chunks, batch_size, stage_callback(20, 40)
+            )
+
+            # 5. Wait for Zep to process the episodes (60-90%)
+            self.task_manager.update_task(
+                task_id,
+                progress=60,
+                message="等待Zep处理数据..."
+            )
+            self._wait_for_episodes(episode_uuids, stage_callback(60, 30))
+
+            # 6. Collect graph statistics
+            self.task_manager.update_task(
+                task_id,
+                progress=90,
+                message="获取图谱信息..."
+            )
+            graph_info = self._get_graph_info(graph_id)
+
+            self.task_manager.complete_task(task_id, {
+                "graph_id": graph_id,
+                "graph_info": graph_info.to_dict(),
+                "chunks_processed": total_chunks,
+            })
+
+        except Exception as e:
+            # Top-level boundary of the worker thread: record the failure on the task.
+            import traceback
+            error_msg = f"{str(e)}\n{traceback.format_exc()}"
+            self.task_manager.fail_task(task_id, error_msg)
+
+    def create_graph(self, name: str) -> str:
+        """
+        Create a Zep graph (public method).
+
+        Args:
+            name: display name of the graph
+
+        Returns:
+            The generated graph id: 'mirofish_' plus 16 hex characters of a uuid4.
+        """
+        graph_id = f"mirofish_{uuid.uuid4().hex[:16]}"
+
+        self.client.graph.create(
+            graph_id=graph_id,
+            name=name,
+            description="MiroFish Social Simulation Graph"
+        )
+
+        return graph_id
+
+    def set_ontology(self, graph_id: str, ontology: Dict[str, Any]):
+        """
+        Register the ontology on a graph (public method).
+
+        Builds one EntityModel subclass per entry of ontology["entity_types"]
+        and one EdgeModel subclass per entry of ontology["edge_types"], then
+        sends them with client.graph.set_ontology. Nothing is sent when both
+        collections end up empty.
+
+        Args:
+            graph_id: target graph
+            ontology: dict with optional "entity_types" and "edge_types" lists;
+                each entry needs a "name" and may carry "description",
+                "attributes" and (edges only) "source_targets".
+
+        Raises:
+            KeyError: if a type or attribute definition has no "name".
+        """
+        # Imported at call time, as the original code did.
+        from pydantic import Field
+        from zep_cloud.external_clients.ontology import EntityModel, EntityText, EdgeModel
+
+        def model_namespace(attribute_defs, description, annotation):
+            # Class namespace for type(): docstring, one Field per attribute and
+            # the matching __annotations__ (Pydantic v2 needs the annotations).
+            namespace = {"__doc__": description}
+            annotations = {}
+            for attr_def in attribute_defs:
+                attr_name = self._safe_attr_name(attr_def["name"])
+                namespace[attr_name] = Field(
+                    description=attr_def.get("description", attr_name),
+                    default=None
+                )
+                annotations[attr_name] = annotation
+            namespace["__annotations__"] = annotations
+            return namespace
+
+        # Entity types: class name is the ontology name as given.
+        entity_types = {}
+        for entity_def in ontology.get("entity_types", []):
+            name = entity_def["name"]
+            description = entity_def.get("description", f"A {name} entity.")
+            entity_class = type(
+                name,
+                (EntityModel,),
+                model_namespace(entity_def.get("attributes", []), description, Optional[EntityText])
+            )
+            entity_class.__doc__ = description
+            entity_types[name] = entity_class
+
+        # Edge types: UPPER_SNAKE_CASE name becomes a PascalCase class name;
+        # edge attributes are annotated as plain optional strings.
+        edge_definitions = {}
+        for edge_def in ontology.get("edge_types", []):
+            name = edge_def["name"]
+            description = edge_def.get("description", f"A {name} relationship.")
+            class_name = ''.join(word.capitalize() for word in name.split('_'))
+            edge_class = type(
+                class_name,
+                (EdgeModel,),
+                model_namespace(edge_def.get("attributes", []), description, Optional[str])
+            )
+            edge_class.__doc__ = description
+
+            source_targets = [
+                EntityEdgeSourceTarget(
+                    source=st.get("source", "Entity"),
+                    target=st.get("target", "Entity")
+                )
+                for st in edge_def.get("source_targets", [])
+            ]
+
+            # Edge types without any source/target pair are not registered.
+            if source_targets:
+                edge_definitions[name] = (edge_class, source_targets)
+
+        if entity_types or edge_definitions:
+            self.client.graph.set_ontology(
+                graph_ids=[graph_id],
+                entities=entity_types if entity_types else None,
+                edges=edge_definitions if edge_definitions else None,
+            )
+
+    def add_text_batches(
+        self,
+        graph_id: str,
+        chunks: List[str],
+        batch_size: int = 3,
+        progress_callback: Optional[Callable] = None
+    ) -> List[str]:
+        """
+        Upload text chunks to the graph in batches.
+
+        Args:
+            graph_id: target graph
+            chunks: text chunks, each sent as one "text" episode
+            batch_size: chunks per add_batch request; must be >= 1
+            progress_callback: optional callable(message, ratio) where ratio is
+                the 0..1 share of chunks covered so far (0 on a failed batch)
+
+        Returns:
+            The uuids of all episodes returned by Zep, in upload order.
+
+        Raises:
+            ValueError: if batch_size is smaller than 1 (a negative value used
+                to skip every chunk silently).
+            Exception: whatever add_batch raises is re-raised after reporting.
+        """
+        if batch_size < 1:
+            raise ValueError(f"batch_size 必须为正整数，当前值: {batch_size}")
+
+        episode_uuids: List[str] = []
+        total_chunks = len(chunks)
+        total_batches = (total_chunks + batch_size - 1) // batch_size
+
+        for batch_num, start in enumerate(range(0, total_chunks, batch_size), start=1):
+            batch_chunks = chunks[start:start + batch_size]
+
+            if progress_callback:
+                progress_callback(
+                    f"发送第 {batch_num}/{total_batches} 批数据 ({len(batch_chunks)} 块)...",
+                    (start + len(batch_chunks)) / total_chunks
+                )
+
+            episodes = [
+                EpisodeData(data=chunk, type="text")
+                for chunk in batch_chunks
+            ]
+
+            try:
+                batch_result = self.client.graph.add_batch(
+                    graph_id=graph_id,
+                    episodes=episodes
+                )
+            except Exception as e:
+                if progress_callback:
+                    progress_callback(f"批次 {batch_num} 发送失败: {str(e)}", 0)
+                raise
+
+            # Collect episode uuids; the attribute may be exposed as uuid_ or uuid.
+            if batch_result and isinstance(batch_result, list):
+                for ep in batch_result:
+                    ep_uuid = getattr(ep, 'uuid_', None) or getattr(ep, 'uuid', None)
+                    if ep_uuid:
+                        episode_uuids.append(ep_uuid)
+
+            # Throttle so requests are not sent too quickly.
+            time.sleep(1)
+
+        return episode_uuids
+
+    def _wait_for_episodes(
+        self,
+        episode_uuids: List[str],
+        progress_callback: Optional[Callable] = None,
+        timeout: int = 600
+    ):
+        """
+        Poll Zep until every episode reports processed, or the timeout elapses.
+
+        Args:
+            episode_uuids: uuids to wait for; duplicates are counted once
+            progress_callback: optional callable(message, ratio) with ratio in 0..1
+            timeout: maximum wait in seconds, checked at the start of each poll round
+
+        A failed status query for one episode is logged and retried on the next
+        round; it does not abort the wait.
+        """
+        import logging
+        logger = logging.getLogger(__name__)
+
+        pending_episodes = set(episode_uuids)
+        if not pending_episodes:
+            if progress_callback:
+                progress_callback("无需等待（没有 episode）", 1.0)
+            return
+
+        start_time = time.time()
+        completed_count = 0
+        # Count distinct uuids so completed/total can actually reach 100%.
+        total_episodes = len(pending_episodes)
+
+        if progress_callback:
+            progress_callback(f"开始等待 {total_episodes} 个文本块处理...", 0)
+
+        while pending_episodes:
+            if time.time() - start_time > timeout:
+                if progress_callback:
+                    progress_callback(
+                        f"部分文本块超时，已完成 {completed_count}/{total_episodes}",
+                        completed_count / total_episodes
+                    )
+                break
+
+            # Iterate over a snapshot because finished uuids are removed in the loop.
+            for ep_uuid in list(pending_episodes):
+                try:
+                    episode = self.client.graph.episode.get(uuid_=ep_uuid)
+                except Exception as exc:
+                    # Best effort: keep the episode pending and try again next round.
+                    logger.warning("Failed to query episode %s: %s", ep_uuid, exc)
+                    continue
+
+                if getattr(episode, 'processed', False):
+                    pending_episodes.remove(ep_uuid)
+                    completed_count += 1
+
+            elapsed = int(time.time() - start_time)
+            if progress_callback:
+                progress_callback(
+                    f"Zep处理中... {completed_count}/{total_episodes} 完成, {len(pending_episodes)} 待处理 ({elapsed}秒)",
+                    completed_count / total_episodes
+                )
+
+            if pending_episodes:
+                time.sleep(3)  # poll every 3 seconds
+
+        if progress_callback:
+            progress_callback(f"处理完成: {completed_count}/{total_episodes}", 1.0)
+
+    def _get_graph_info(self, graph_id: str) -> "GraphInfo":
+        """
+        Collect node/edge counts and the entity type labels of a graph.
+
+        The generic labels "Entity" and "Node" are not reported as entity types.
+        """
+        # NOTE(review): a single get_by_graph_id call is assumed to return every
+        # node/edge; confirm whether the Zep API paginates these results.
+        nodes = self.client.graph.node.get_by_graph_id(graph_id=graph_id)
+        edges = self.client.graph.edge.get_by_graph_id(graph_id=graph_id)
+
+        generic_labels = ("Entity", "Node")
+        entity_types = {
+            label
+            for node in nodes
+            if node.labels
+            for label in node.labels
+            if label not in generic_labels
+        }
+
+        return GraphInfo(
+            graph_id=graph_id,
+            node_count=len(nodes),
+            edge_count=len(edges),
+            entity_types=list(entity_types)
+        )
+
+    def get_graph_data(self, graph_id: str) -> Dict[str, Any]:
+        """
+        Fetch the full graph as plain dicts.
+
+        Args:
+            graph_id: graph to read
+
+        Returns:
+            Dict with "graph_id", "nodes", "edges", "node_count" and
+            "edge_count"; missing labels/summary/attributes/name/fact are
+            replaced by empty values.
+        """
+        nodes = self.client.graph.node.get_by_graph_id(graph_id=graph_id)
+        edges = self.client.graph.edge.get_by_graph_id(graph_id=graph_id)
+
+        nodes_data = [
+            {
+                "uuid": node.uuid_,
+                "name": node.name,
+                "labels": node.labels or [],
+                "summary": node.summary or "",
+                "attributes": node.attributes or {},
+            }
+            for node in nodes
+        ]
+
+        edges_data = [
+            {
+                "uuid": edge.uuid_,
+                "name": edge.name or "",
+                "fact": edge.fact or "",
+                "source_node_uuid": edge.source_node_uuid,
+                "target_node_uuid": edge.target_node_uuid,
+                "attributes": edge.attributes or {},
+            }
+            for edge in edges
+        ]
+
+        return {
+            "graph_id": graph_id,
+            "nodes": nodes_data,
+            "edges": edges_data,
+            "node_count": len(nodes_data),
+            "edge_count": len(edges_data),
+        }
+
+    def delete_graph(self, graph_id: str):
+        """Delete the graph with the given id through the Zep client."""
+        self.client.graph.delete(graph_id=graph_id)
+
--- a/backend/app/services/ontology_generator.py
+++ b/backend/app/services/ontology_generator.py
@@ -0,0 +1,361 @@
+"""
+本体生成服务
+接口1：分析文本内容，生成适合社会模拟的实体和关系类型定义
+"""
+
+import json
+from typing import Dict, Any, List, Optional
+from ..utils.llm_client import LLMClient
+
+
+# System prompt for ontology generation. It tells the LLM to design entity and
+# relationship types for a social-media opinion simulation and to answer with
+# JSON containing "entity_types", "edge_types" and "analysis_summary".
+ONTOLOGY_SYSTEM_PROMPT = """你是一个专业的知识图谱本体设计专家。你的任务是分析给定的文本内容和模拟需求，设计适合**社交媒体舆论模拟**的实体类型和关系类型。
+
+**重要：你必须输出有效的JSON格式数据，不要输出任何其他内容。**
+
+## 核心任务背景
+
+我们正在构建一个**社交媒体舆论模拟系统**。在这个系统中：
+- 每个实体都是一个可以在社交媒体上发声、互动、传播信息的"账号"或"主体"
+- 实体之间会相互影响、转发、评论、回应
+- 我们需要模拟舆论事件中各方的反应和信息传播路径
+
+因此，**实体必须是现实中真实存在的、可以在社媒上发声和互动的主体**：
+
+**可以是（鼓励多样化划分）**：
+- 具体的个人（公众人物、当事人、意见领袖、专家学者）
+- 公司、企业（包括其官方账号）
+- 组织机构（大学、协会、NGO、工会等）
+- 政府部门、监管机构
+- 媒体机构（报纸、电视台、自媒体、网站）
+- 社交媒体平台本身
+- 特定群体代表（如校友会、粉丝团、维权群体等）
+
+**不可以是**：
+- 抽象概念（如"舆论"、"情绪"、"趋势"）
+- 主题/话题（如"学术诚信"、"教育改革"）
+- 观点/态度（如"支持方"、"反对方"）
+- 泛指群体（如"网友"、"公众"、"学生群体"）
+
+## 输出格式
+
+请输出JSON格式，包含以下结构：
+
+```json
+{
+    "entity_types": [
+        {
+            "name": "实体类型名称（英文，PascalCase）",
+            "description": "简短描述（英文，不超过100字符）",
+            "attributes": [
+                {
+                    "name": "属性名（英文，snake_case）",
+                    "type": "text",
+                    "description": "属性描述"
+                }
+            ],
+            "examples": ["示例实体1", "示例实体2"]
+        }
+    ],
+    "edge_types": [
+        {
+            "name": "关系类型名称（英文，UPPER_SNAKE_CASE）",
+            "description": "简短描述（英文，不超过100字符）",
+            "source_targets": [
+                {"source": "源实体类型", "target": "目标实体类型"}
+            ],
+            "attributes": []
+        }
+    ],
+    "analysis_summary": "对文本内容的简要分析说明（中文）"
+}
+```
+
+## 设计指南
+
+1. **实体类型设计（重要！请尽量多划分）**：
+   - **数量要求：至少5个，最多10个实体类型**
+   - 每个实体类型代表一类可以在社媒上发声的主体
+   - 尽量细分不同角色，例如：
+     - 不要只用"Person"，可以细分为"PublicFigure"、"Expert"、"Whistleblower"等
+     - 不要只用"Organization"，可以细分为"University"、"Company"、"NGO"等
+   - description必须清晰说明什么样的实体应该被提取
+   - 每个类型提供2-3个具体示例
+
+2. **关系类型设计**：
+   - 关系应该反映社媒互动中的真实联系
+   - 关注可能影响舆论传播的关系：
+     - 信息传播：REPORTS_ON, COMMENTS_ON, SHARES
+     - 组织关系：AFFILIATED_WITH, WORKS_FOR, REPRESENTS
+     - 互动关系：RESPONDS_TO, SUPPORTS, OPPOSES
+   - 关系类型：5-10个为宜
+
+3. **属性设计**：
+   - 每个实体类型1-3个关键属性
+   - 属性应有助于识别实体的社媒影响力（如role、influence_level等）
+
+## 实体类型参考（请根据文本内容灵活选择和扩展）
+
+- Person: 普通个人
+- PublicFigure: 公众人物（明星、网红、意见领袖）
+- Expert: 专家学者
+- Journalist: 记者
+- Company: 公司企业
+- University: 高校
+- GovernmentAgency: 政府机构
+- MediaOutlet: 传统媒体
+- SelfMedia: 自媒体账号
+- SocialPlatform: 社交媒体平台
+- NGO: 非政府组织
+- IndustryAssociation: 行业协会
+- AlumniAssociation: 校友会
+- FanGroup: 粉丝群体/支持群体
+
+## 关系类型参考
+
+- WORKS_FOR: 工作于
+- AFFILIATED_WITH: 隶属于
+- REPRESENTS: 代表
+- REGULATES: 监管
+- REPORTS_ON: 报道
+- COMMENTS_ON: 评论
+- RESPONDS_TO: 回应
+- SUPPORTS: 支持
+- OPPOSES: 反对
+- COLLABORATES_WITH: 合作
+- COMPETES_WITH: 竞争
+"""
+
+
+class OntologyGenerator:
+    """
+    Ontology generator.
+
+    Sends the document text and the simulation requirement to an LLM, then
+    normalizes the returned entity/relationship type definitions. Can also
+    render an ontology as Python source code.
+    """
+
+    # Maximum number of characters of document text passed to the LLM.
+    MAX_TEXT_LENGTH_FOR_LLM = 50000
+
+    # Descriptions longer than this are cut and terminated with "...".
+    _MAX_DESCRIPTION_LENGTH = 100
+
+    def __init__(self, llm_client: Optional["LLMClient"] = None):
+        """
+        Args:
+            llm_client: client used for the chat call; a new LLMClient() is
+                created when omitted.
+        """
+        self.llm_client = llm_client or LLMClient()
+
+    def generate(
+        self,
+        document_texts: List[str],
+        simulation_requirement: str,
+        additional_context: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """
+        Generate an ontology definition.
+
+        Args:
+            document_texts: list of document texts
+            simulation_requirement: description of the simulation requirement
+            additional_context: optional extra instructions appended to the prompt
+
+        Returns:
+            The normalized ontology dict (entity_types, edge_types, analysis_summary).
+        """
+        user_message = self._build_user_message(
+            document_texts,
+            simulation_requirement,
+            additional_context
+        )
+
+        messages = [
+            {"role": "system", "content": ONTOLOGY_SYSTEM_PROMPT},
+            {"role": "user", "content": user_message}
+        ]
+
+        result = self.llm_client.chat_json(
+            messages=messages,
+            temperature=0.3,
+            max_tokens=4096
+        )
+
+        return self._validate_and_process(result)
+
+    def _build_user_message(
+        self,
+        document_texts: List[str],
+        simulation_requirement: str,
+        additional_context: Optional[str]
+    ) -> str:
+        """
+        Build the user message for the LLM.
+
+        Documents are joined with a '---' separator. Text beyond
+        MAX_TEXT_LENGTH_FOR_LLM characters is cut and a note with the original
+        length is appended; only this prompt is affected by the truncation.
+        """
+        combined_text = "\n\n---\n\n".join(document_texts)
+        original_length = len(combined_text)
+
+        if original_length > self.MAX_TEXT_LENGTH_FOR_LLM:
+            combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM]
+            combined_text += f"\n\n...(原文共{original_length}字，已截取前{self.MAX_TEXT_LENGTH_FOR_LLM}字用于本体分析)..."
+
+        message = f"""## 模拟需求
+
+{simulation_requirement}
+
+## 文档内容
+
+{combined_text}
+"""
+
+        if additional_context:
+            message += f"""
+## 额外说明
+
+{additional_context}
+"""
+
+        message += """
+请根据以上内容，设计适合社会舆论模拟的实体类型和关系类型。
+记住：所有实体类型必须是现实中可以发声的主体，不能是抽象概念。
+"""
+
+        return message
+
+    @staticmethod
+    def _ensure_list(mapping: Dict[str, Any], key: str) -> None:
+        """Replace a missing or non-list value under *key* with an empty list."""
+        if not isinstance(mapping.get(key), list):
+            mapping[key] = []
+
+    @classmethod
+    def _truncate_description(cls, definition: Dict[str, Any]) -> None:
+        """Cut an over-long string description to the limit, ending in '...'."""
+        description = definition.get("description")
+        if isinstance(description, str) and len(description) > cls._MAX_DESCRIPTION_LENGTH:
+            definition["description"] = description[:cls._MAX_DESCRIPTION_LENGTH - 3] + "..."
+
+    def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Normalize the LLM result in place and return it.
+
+        - "entity_types" / "edge_types" become lists (missing, null or
+          non-list values are replaced by []); entries that are not dicts
+          are dropped because later steps index them by key.
+        - "analysis_summary" defaults to "".
+        - Each type gets list-valued "attributes"; entities also "examples",
+          edges also "source_targets".
+        - String descriptions longer than 100 characters are truncated.
+        """
+        for key in ("entity_types", "edge_types"):
+            self._ensure_list(result, key)
+            result[key] = [item for item in result[key] if isinstance(item, dict)]
+        result.setdefault("analysis_summary", "")
+
+        for entity in result["entity_types"]:
+            self._ensure_list(entity, "attributes")
+            self._ensure_list(entity, "examples")
+            self._truncate_description(entity)
+
+        for edge in result["edge_types"]:
+            self._ensure_list(edge, "source_targets")
+            self._ensure_list(edge, "attributes")
+            self._truncate_description(edge)
+
+        return result
+
+    @staticmethod
+    def _edge_class_name(name: str) -> str:
+        """Convert an UPPER_SNAKE_CASE edge name to a PascalCase class name."""
+        return ''.join(word.capitalize() for word in name.split('_'))
+
+    @staticmethod
+    def _docstring_literal(text: Any) -> str:
+        """Render *text* as a triple-quoted literal with backslashes and quotes escaped."""
+        escaped = str(text).replace('\\', '\\\\').replace('"', '\\"')
+        return f'"""{escaped}"""'
+
+    def _render_model_class(
+        self,
+        class_name: str,
+        base_name: str,
+        description: Any,
+        attributes: List[Dict[str, Any]]
+    ) -> List[str]:
+        """Return the source lines of one model class, followed by two blank lines."""
+        lines = [
+            f'class {class_name}({base_name}):',
+            f'    {self._docstring_literal(description)}',
+        ]
+
+        if attributes:
+            for attr in attributes:
+                attr_name = attr["name"]
+                # json.dumps yields a double-quoted literal that is also valid
+                # Python, so quotes/backslashes/newlines cannot break the output.
+                attr_desc = json.dumps(str(attr.get("description", attr_name)), ensure_ascii=False)
+                lines.extend([
+                    f'    {attr_name}: EntityText = Field(',
+                    f'        description={attr_desc},',
+                    '        default=None',
+                    '    )',
+                ])
+        else:
+            lines.append('    pass')
+
+        lines.extend(['', ''])
+        return lines
+
+    def generate_python_code(self, ontology: Dict[str, Any]) -> str:
+        """
+        Render an ontology definition as Python source code.
+
+        The output declares one EntityModel subclass per entity type, one
+        EdgeModel subclass per edge type, and the ENTITY_TYPES, EDGE_TYPES and
+        EDGE_SOURCE_TARGETS dictionaries. Descriptions are escaped so that any
+        text produces syntactically valid code.
+
+        Args:
+            ontology: ontology definition; type "name" values are used verbatim
+                as identifiers and are expected to be valid Python names.
+
+        Returns:
+            Python source code as a single string.
+        """
+        entity_defs = ontology.get("entity_types", [])
+        edge_defs = ontology.get("edge_types", [])
+
+        code_lines = [
+            '"""',
+            '自定义实体类型定义',
+            '由MiroFish自动生成，用于社会舆论模拟',
+            '"""',
+            '',
+            'from pydantic import Field',
+            'from zep_cloud.external_clients.ontology import EntityModel, EntityText, EdgeModel',
+            '',
+            '',
+            '# ============== 实体类型定义 ==============',
+            '',
+        ]
+
+        for entity in entity_defs:
+            name = entity["name"]
+            code_lines.extend(self._render_model_class(
+                name,
+                'EntityModel',
+                entity.get("description", f"A {name} entity."),
+                entity.get("attributes", []),
+            ))
+
+        code_lines.append('# ============== 关系类型定义 ==============')
+        code_lines.append('')
+
+        for edge in edge_defs:
+            name = edge["name"]
+            code_lines.extend(self._render_model_class(
+                self._edge_class_name(name),
+                'EdgeModel',
+                edge.get("description", f"A {name} relationship."),
+                edge.get("attributes", []),
+            ))
+
+        # Lookup dictionaries from type name to generated class.
+        code_lines.append('# ============== 类型配置 ==============')
+        code_lines.append('')
+        code_lines.append('ENTITY_TYPES = {')
+        for entity in entity_defs:
+            name = entity["name"]
+            code_lines.append(f'    "{name}": {name},')
+        code_lines.append('}')
+        code_lines.append('')
+        code_lines.append('EDGE_TYPES = {')
+        for edge in edge_defs:
+            name = edge["name"]
+            code_lines.append(f'    "{name}": {self._edge_class_name(name)},')
+        code_lines.append('}')
+        code_lines.append('')
+
+        # Source/target pairs per edge type; edges without pairs are omitted.
+        code_lines.append('EDGE_SOURCE_TARGETS = {')
+        for edge in edge_defs:
+            source_targets = edge.get("source_targets", [])
+            if source_targets:
+                st_list = ', '.join(
+                    json.dumps(
+                        {
+                            "source": str(st.get("source", "Entity")),
+                            "target": str(st.get("target", "Entity")),
+                        },
+                        ensure_ascii=False
+                    )
+                    for st in source_targets
+                )
+                code_lines.append(f'    "{edge["name"]}": [{st_list}],')
+        code_lines.append('}')
+
+        return '\n'.join(code_lines)
+
--- a/backend/app/services/text_processor.py
+++ b/backend/app/services/text_processor.py
@@ -0,0 +1,71 @@
+"""
+文本处理服务
+"""
+
+from typing import List, Optional
+from ..utils.file_parser import FileParser, split_text_into_chunks
+
+
+class TextProcessor:
+    """Text processing helpers: file extraction, chunking, cleanup and statistics."""
+
+    @staticmethod
+    def extract_from_files(file_paths: List[str]) -> str:
+        """Extract text from several files by delegating to FileParser.extract_from_multiple."""
+        return FileParser.extract_from_multiple(file_paths)
+
+    @staticmethod
+    def split_text(
+        text: str,
+        chunk_size: int = 500,
+        overlap: int = 50
+    ) -> List[str]:
+        """
+        Split text into chunks by delegating to split_text_into_chunks.
+
+        Args:
+            text: original text
+            chunk_size: chunk size
+            overlap: overlap between consecutive chunks
+
+        Returns:
+            List of text chunks.
+        """
+        return split_text_into_chunks(text, chunk_size, overlap)
+
+    @staticmethod
+    def preprocess_text(text: str) -> str:
+        """
+        Clean up text.
+
+        - normalizes CRLF / CR line endings to LF
+        - strips leading and trailing whitespace from every line
+        - collapses runs of three or more newlines to exactly two
+        - strips the result as a whole
+
+        Args:
+            text: original text
+
+        Returns:
+            The cleaned text.
+        """
+        import re
+
+        text = text.replace('\r\n', '\n').replace('\r', '\n')
+
+        # Strip lines first: whitespace-only lines become empty, so the
+        # blank-line collapse below also catches them.
+        text = '\n'.join(line.strip() for line in text.split('\n'))
+
+        text = re.sub(r'\n{3,}', '\n\n', text)
+
+        return text.strip()
+
+    @staticmethod
+    def get_text_stats(text: str) -> dict:
+        """
+        Return basic statistics about the text.
+
+        Returns:
+            Dict with "total_chars", "total_lines" (newline count + 1, or 0
+            for empty text) and "total_words" (whitespace-separated tokens).
+        """
+        return {
+            "total_chars": len(text),
+            "total_lines": text.count('\n') + 1 if text else 0,
+            "total_words": len(text.split()),
+        }
+