Introduce Project ID for context management, finalizing the stateful API pipeline from file submission to graph construction.

This commit is contained in:
666ghj
2025-11-28 17:21:08 +08:00
parent 9657061b26
commit 08f417f3b7
20 changed files with 2850 additions and 1 deletions

View File

@@ -0,0 +1,9 @@
"""
数据模型模块
"""
from .task import TaskManager, TaskStatus
from .project import Project, ProjectStatus, ProjectManager
__all__ = ['TaskManager', 'TaskStatus', 'Project', 'ProjectStatus', 'ProjectManager']

View File

@@ -0,0 +1,305 @@
"""
项目上下文管理
用于在服务端持久化项目状态,避免前端在接口间传递大量数据
"""
import os
import json
import uuid
import shutil
from datetime import datetime
from typing import Dict, Any, List, Optional
from enum import Enum
from dataclasses import dataclass, field, asdict
from ..config import Config
class ProjectStatus(str, Enum):
"""项目状态"""
CREATED = "created" # 刚创建,文件已上传
ONTOLOGY_GENERATED = "ontology_generated" # 本体已生成
GRAPH_BUILDING = "graph_building" # 图谱构建中
GRAPH_COMPLETED = "graph_completed" # 图谱构建完成
FAILED = "failed" # 失败
@dataclass
class Project:
"""项目数据模型"""
project_id: str
name: str
status: ProjectStatus
created_at: str
updated_at: str
# 文件信息
files: List[Dict[str, str]] = field(default_factory=list) # [{filename, path, size}]
total_text_length: int = 0
# 本体信息接口1生成后填充
ontology: Optional[Dict[str, Any]] = None
analysis_summary: Optional[str] = None
# 图谱信息接口2完成后填充
graph_id: Optional[str] = None
graph_build_task_id: Optional[str] = None
# 配置
simulation_requirement: Optional[str] = None
chunk_size: int = 500
chunk_overlap: int = 50
# 错误信息
error: Optional[str] = None
def to_dict(self) -> Dict[str, Any]:
"""转换为字典"""
return {
"project_id": self.project_id,
"name": self.name,
"status": self.status.value if isinstance(self.status, ProjectStatus) else self.status,
"created_at": self.created_at,
"updated_at": self.updated_at,
"files": self.files,
"total_text_length": self.total_text_length,
"ontology": self.ontology,
"analysis_summary": self.analysis_summary,
"graph_id": self.graph_id,
"graph_build_task_id": self.graph_build_task_id,
"simulation_requirement": self.simulation_requirement,
"chunk_size": self.chunk_size,
"chunk_overlap": self.chunk_overlap,
"error": self.error
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'Project':
"""从字典创建"""
status = data.get('status', 'created')
if isinstance(status, str):
status = ProjectStatus(status)
return cls(
project_id=data['project_id'],
name=data.get('name', 'Unnamed Project'),
status=status,
created_at=data.get('created_at', ''),
updated_at=data.get('updated_at', ''),
files=data.get('files', []),
total_text_length=data.get('total_text_length', 0),
ontology=data.get('ontology'),
analysis_summary=data.get('analysis_summary'),
graph_id=data.get('graph_id'),
graph_build_task_id=data.get('graph_build_task_id'),
simulation_requirement=data.get('simulation_requirement'),
chunk_size=data.get('chunk_size', 500),
chunk_overlap=data.get('chunk_overlap', 50),
error=data.get('error')
)
class ProjectManager:
"""项目管理器 - 负责项目的持久化存储和检索"""
# 项目存储根目录
PROJECTS_DIR = os.path.join(Config.UPLOAD_FOLDER, 'projects')
@classmethod
def _ensure_projects_dir(cls):
"""确保项目目录存在"""
os.makedirs(cls.PROJECTS_DIR, exist_ok=True)
@classmethod
def _get_project_dir(cls, project_id: str) -> str:
"""获取项目目录路径"""
return os.path.join(cls.PROJECTS_DIR, project_id)
@classmethod
def _get_project_meta_path(cls, project_id: str) -> str:
"""获取项目元数据文件路径"""
return os.path.join(cls._get_project_dir(project_id), 'project.json')
@classmethod
def _get_project_files_dir(cls, project_id: str) -> str:
"""获取项目文件存储目录"""
return os.path.join(cls._get_project_dir(project_id), 'files')
@classmethod
def _get_project_text_path(cls, project_id: str) -> str:
"""获取项目提取文本存储路径"""
return os.path.join(cls._get_project_dir(project_id), 'extracted_text.txt')
@classmethod
def create_project(cls, name: str = "Unnamed Project") -> Project:
"""
创建新项目
Args:
name: 项目名称
Returns:
新创建的Project对象
"""
cls._ensure_projects_dir()
project_id = f"proj_{uuid.uuid4().hex[:12]}"
now = datetime.now().isoformat()
project = Project(
project_id=project_id,
name=name,
status=ProjectStatus.CREATED,
created_at=now,
updated_at=now
)
# 创建项目目录结构
project_dir = cls._get_project_dir(project_id)
files_dir = cls._get_project_files_dir(project_id)
os.makedirs(project_dir, exist_ok=True)
os.makedirs(files_dir, exist_ok=True)
# 保存项目元数据
cls.save_project(project)
return project
@classmethod
def save_project(cls, project: Project) -> None:
"""保存项目元数据"""
project.updated_at = datetime.now().isoformat()
meta_path = cls._get_project_meta_path(project.project_id)
with open(meta_path, 'w', encoding='utf-8') as f:
json.dump(project.to_dict(), f, ensure_ascii=False, indent=2)
@classmethod
def get_project(cls, project_id: str) -> Optional[Project]:
"""
获取项目
Args:
project_id: 项目ID
Returns:
Project对象如果不存在返回None
"""
meta_path = cls._get_project_meta_path(project_id)
if not os.path.exists(meta_path):
return None
with open(meta_path, 'r', encoding='utf-8') as f:
data = json.load(f)
return Project.from_dict(data)
@classmethod
def list_projects(cls, limit: int = 50) -> List[Project]:
"""
列出所有项目
Args:
limit: 返回数量限制
Returns:
项目列表,按创建时间倒序
"""
cls._ensure_projects_dir()
projects = []
for project_id in os.listdir(cls.PROJECTS_DIR):
project = cls.get_project(project_id)
if project:
projects.append(project)
# 按创建时间倒序排序
projects.sort(key=lambda p: p.created_at, reverse=True)
return projects[:limit]
@classmethod
def delete_project(cls, project_id: str) -> bool:
"""
删除项目及其所有文件
Args:
project_id: 项目ID
Returns:
是否删除成功
"""
project_dir = cls._get_project_dir(project_id)
if not os.path.exists(project_dir):
return False
shutil.rmtree(project_dir)
return True
@classmethod
def save_file_to_project(cls, project_id: str, file_storage, original_filename: str) -> Dict[str, str]:
"""
保存上传的文件到项目目录
Args:
project_id: 项目ID
file_storage: Flask的FileStorage对象
original_filename: 原始文件名
Returns:
文件信息字典 {filename, path, size}
"""
files_dir = cls._get_project_files_dir(project_id)
os.makedirs(files_dir, exist_ok=True)
# 生成安全的文件名
ext = os.path.splitext(original_filename)[1].lower()
safe_filename = f"{uuid.uuid4().hex[:8]}{ext}"
file_path = os.path.join(files_dir, safe_filename)
# 保存文件
file_storage.save(file_path)
# 获取文件大小
file_size = os.path.getsize(file_path)
return {
"original_filename": original_filename,
"saved_filename": safe_filename,
"path": file_path,
"size": file_size
}
@classmethod
def save_extracted_text(cls, project_id: str, text: str) -> None:
"""保存提取的文本"""
text_path = cls._get_project_text_path(project_id)
with open(text_path, 'w', encoding='utf-8') as f:
f.write(text)
@classmethod
def get_extracted_text(cls, project_id: str) -> Optional[str]:
"""获取提取的文本"""
text_path = cls._get_project_text_path(project_id)
if not os.path.exists(text_path):
return None
with open(text_path, 'r', encoding='utf-8') as f:
return f.read()
@classmethod
def get_project_files(cls, project_id: str) -> List[str]:
"""获取项目的所有文件路径"""
files_dir = cls._get_project_files_dir(project_id)
if not os.path.exists(files_dir):
return []
return [
os.path.join(files_dir, f)
for f in os.listdir(files_dir)
if os.path.isfile(os.path.join(files_dir, f))
]

178
backend/app/models/task.py Normal file
View File

@@ -0,0 +1,178 @@
"""
任务状态管理
用于跟踪长时间运行的任务(如图谱构建)
"""
import uuid
import threading
from datetime import datetime
from enum import Enum
from typing import Dict, Any, Optional
from dataclasses import dataclass, field
class TaskStatus(str, Enum):
"""任务状态枚举"""
PENDING = "pending" # 等待中
PROCESSING = "processing" # 处理中
COMPLETED = "completed" # 已完成
FAILED = "failed" # 失败
@dataclass
class Task:
"""任务数据类"""
task_id: str
task_type: str
status: TaskStatus
created_at: datetime
updated_at: datetime
progress: int = 0 # 进度百分比 0-100
message: str = "" # 状态消息
result: Optional[Dict] = None # 任务结果
error: Optional[str] = None # 错误信息
metadata: Dict = field(default_factory=dict) # 额外元数据
def to_dict(self) -> Dict[str, Any]:
"""转换为字典"""
return {
"task_id": self.task_id,
"task_type": self.task_type,
"status": self.status.value,
"created_at": self.created_at.isoformat(),
"updated_at": self.updated_at.isoformat(),
"progress": self.progress,
"message": self.message,
"result": self.result,
"error": self.error,
"metadata": self.metadata,
}
class TaskManager:
"""
任务管理器
线程安全的任务状态管理
"""
_instance = None
_lock = threading.Lock()
def __new__(cls):
"""单例模式"""
if cls._instance is None:
with cls._lock:
if cls._instance is None:
cls._instance = super().__new__(cls)
cls._instance._tasks: Dict[str, Task] = {}
cls._instance._task_lock = threading.Lock()
return cls._instance
def create_task(self, task_type: str, metadata: Optional[Dict] = None) -> str:
"""
创建新任务
Args:
task_type: 任务类型
metadata: 额外元数据
Returns:
任务ID
"""
task_id = str(uuid.uuid4())
now = datetime.now()
task = Task(
task_id=task_id,
task_type=task_type,
status=TaskStatus.PENDING,
created_at=now,
updated_at=now,
metadata=metadata or {}
)
with self._task_lock:
self._tasks[task_id] = task
return task_id
def get_task(self, task_id: str) -> Optional[Task]:
"""获取任务"""
with self._task_lock:
return self._tasks.get(task_id)
def update_task(
self,
task_id: str,
status: Optional[TaskStatus] = None,
progress: Optional[int] = None,
message: Optional[str] = None,
result: Optional[Dict] = None,
error: Optional[str] = None
):
"""
更新任务状态
Args:
task_id: 任务ID
status: 新状态
progress: 进度
message: 消息
result: 结果
error: 错误信息
"""
with self._task_lock:
task = self._tasks.get(task_id)
if task:
task.updated_at = datetime.now()
if status is not None:
task.status = status
if progress is not None:
task.progress = progress
if message is not None:
task.message = message
if result is not None:
task.result = result
if error is not None:
task.error = error
def complete_task(self, task_id: str, result: Dict):
"""标记任务完成"""
self.update_task(
task_id,
status=TaskStatus.COMPLETED,
progress=100,
message="任务完成",
result=result
)
def fail_task(self, task_id: str, error: str):
"""标记任务失败"""
self.update_task(
task_id,
status=TaskStatus.FAILED,
message="任务失败",
error=error
)
def list_tasks(self, task_type: Optional[str] = None) -> list:
"""列出任务"""
with self._task_lock:
tasks = list(self._tasks.values())
if task_type:
tasks = [t for t in tasks if t.task_type == task_type]
return [t.to_dict() for t in sorted(tasks, key=lambda x: x.created_at, reverse=True)]
def cleanup_old_tasks(self, max_age_hours: int = 24):
"""清理旧任务"""
from datetime import timedelta
cutoff = datetime.now() - timedelta(hours=max_age_hours)
with self._task_lock:
old_ids = [
tid for tid, task in self._tasks.items()
if task.created_at < cutoff and task.status in [TaskStatus.COMPLETED, TaskStatus.FAILED]
]
for tid in old_ids:
del self._tasks[tid]