Introduce Project ID for context management, finalizing the stateful API pipeline from file submission to graph construction.

2025-11-28 17:21:08 +08:00
parent 9657061b26
commit 08f417f3b7
20 changed files with 2850 additions and 1 deletions
--- a/backend/app/utils/file_parser.py
+++ b/backend/app/utils/file_parser.py
@@ -0,0 +1,141 @@
+"""
+文件解析工具
+支持PDF、Markdown、TXT文件的文本提取
+"""
+
+import os
+from pathlib import Path
+from typing import List, Optional
+
+
+class FileParser:
+    """文件解析器"""
+    
+    SUPPORTED_EXTENSIONS = {'.pdf', '.md', '.markdown', '.txt'}
+    
+    @classmethod
+    def extract_text(cls, file_path: str) -> str:
+        """
+        从文件中提取文本
+        
+        Args:
+            file_path: 文件路径
+            
+        Returns:
+            提取的文本内容
+        """
+        path = Path(file_path)
+        
+        if not path.exists():
+            raise FileNotFoundError(f"文件不存在: {file_path}")
+        
+        suffix = path.suffix.lower()
+        
+        if suffix not in cls.SUPPORTED_EXTENSIONS:
+            raise ValueError(f"不支持的文件格式: {suffix}")
+        
+        if suffix == '.pdf':
+            return cls._extract_from_pdf(file_path)
+        elif suffix in {'.md', '.markdown'}:
+            return cls._extract_from_md(file_path)
+        elif suffix == '.txt':
+            return cls._extract_from_txt(file_path)
+        
+        raise ValueError(f"无法处理的文件格式: {suffix}")
+    
+    @staticmethod
+    def _extract_from_pdf(file_path: str) -> str:
+        """从PDF提取文本"""
+        try:
+            import fitz  # PyMuPDF
+        except ImportError:
+            raise ImportError("需要安装PyMuPDF: pip install PyMuPDF")
+        
+        text_parts = []
+        with fitz.open(file_path) as doc:
+            for page in doc:
+                text = page.get_text()
+                if text.strip():
+                    text_parts.append(text)
+        
+        return "\n\n".join(text_parts)
+    
+    @staticmethod
+    def _extract_from_md(file_path: str) -> str:
+        """从Markdown提取文本"""
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return f.read()
+    
+    @staticmethod
+    def _extract_from_txt(file_path: str) -> str:
+        """从TXT提取文本"""
+        with open(file_path, 'r', encoding='utf-8') as f:
+            return f.read()
+    
+    @classmethod
+    def extract_from_multiple(cls, file_paths: List[str]) -> str:
+        """
+        从多个文件提取文本并合并
+        
+        Args:
+            file_paths: 文件路径列表
+            
+        Returns:
+            合并后的文本
+        """
+        all_texts = []
+        
+        for i, file_path in enumerate(file_paths, 1):
+            try:
+                text = cls.extract_text(file_path)
+                filename = Path(file_path).name
+                all_texts.append(f"=== 文档 {i}: {filename} ===\n{text}")
+            except Exception as e:
+                all_texts.append(f"=== 文档 {i}: {file_path} (提取失败: {str(e)}) ===")
+        
+        return "\n\n".join(all_texts)
+
+
+def split_text_into_chunks(
+    text: str, 
+    chunk_size: int = 500, 
+    overlap: int = 50
+) -> List[str]:
+    """
+    将文本分割成小块
+    
+    Args:
+        text: 原始文本
+        chunk_size: 每块的字符数
+        overlap: 重叠字符数
+        
+    Returns:
+        文本块列表
+    """
+    if len(text) <= chunk_size:
+        return [text] if text.strip() else []
+    
+    chunks = []
+    start = 0
+    
+    while start < len(text):
+        end = start + chunk_size
+        
+        # 尝试在句子边界处分割
+        if end < len(text):
+            # 查找最近的句子结束符
+            for sep in ['。', '！', '？', '.\n', '!\n', '?\n', '\n\n', '. ', '! ', '? ']:
+                last_sep = text[start:end].rfind(sep)
+                if last_sep != -1 and last_sep > chunk_size * 0.3:
+                    end = start + last_sep + len(sep)
+                    break
+        
+        chunk = text[start:end].strip()
+        if chunk:
+            chunks.append(chunk)
+        
+        # 下一个块从重叠位置开始
+        start = end - overlap if end < len(text) else len(text)
+    
+    return chunks
+