Introduce Project ID for context management, finalizing the stateful API pipeline from file submission to graph construction.
This commit is contained in:
141
backend/app/utils/file_parser.py
Normal file
141
backend/app/utils/file_parser.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""
|
||||
文件解析工具
|
||||
支持PDF、Markdown、TXT文件的文本提取
|
||||
"""
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
class FileParser:
|
||||
"""文件解析器"""
|
||||
|
||||
SUPPORTED_EXTENSIONS = {'.pdf', '.md', '.markdown', '.txt'}
|
||||
|
||||
@classmethod
|
||||
def extract_text(cls, file_path: str) -> str:
|
||||
"""
|
||||
从文件中提取文本
|
||||
|
||||
Args:
|
||||
file_path: 文件路径
|
||||
|
||||
Returns:
|
||||
提取的文本内容
|
||||
"""
|
||||
path = Path(file_path)
|
||||
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(f"文件不存在: {file_path}")
|
||||
|
||||
suffix = path.suffix.lower()
|
||||
|
||||
if suffix not in cls.SUPPORTED_EXTENSIONS:
|
||||
raise ValueError(f"不支持的文件格式: {suffix}")
|
||||
|
||||
if suffix == '.pdf':
|
||||
return cls._extract_from_pdf(file_path)
|
||||
elif suffix in {'.md', '.markdown'}:
|
||||
return cls._extract_from_md(file_path)
|
||||
elif suffix == '.txt':
|
||||
return cls._extract_from_txt(file_path)
|
||||
|
||||
raise ValueError(f"无法处理的文件格式: {suffix}")
|
||||
|
||||
@staticmethod
|
||||
def _extract_from_pdf(file_path: str) -> str:
|
||||
"""从PDF提取文本"""
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
except ImportError:
|
||||
raise ImportError("需要安装PyMuPDF: pip install PyMuPDF")
|
||||
|
||||
text_parts = []
|
||||
with fitz.open(file_path) as doc:
|
||||
for page in doc:
|
||||
text = page.get_text()
|
||||
if text.strip():
|
||||
text_parts.append(text)
|
||||
|
||||
return "\n\n".join(text_parts)
|
||||
|
||||
@staticmethod
|
||||
def _extract_from_md(file_path: str) -> str:
|
||||
"""从Markdown提取文本"""
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return f.read()
|
||||
|
||||
@staticmethod
|
||||
def _extract_from_txt(file_path: str) -> str:
|
||||
"""从TXT提取文本"""
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return f.read()
|
||||
|
||||
@classmethod
|
||||
def extract_from_multiple(cls, file_paths: List[str]) -> str:
|
||||
"""
|
||||
从多个文件提取文本并合并
|
||||
|
||||
Args:
|
||||
file_paths: 文件路径列表
|
||||
|
||||
Returns:
|
||||
合并后的文本
|
||||
"""
|
||||
all_texts = []
|
||||
|
||||
for i, file_path in enumerate(file_paths, 1):
|
||||
try:
|
||||
text = cls.extract_text(file_path)
|
||||
filename = Path(file_path).name
|
||||
all_texts.append(f"=== 文档 {i}: {filename} ===\n{text}")
|
||||
except Exception as e:
|
||||
all_texts.append(f"=== 文档 {i}: {file_path} (提取失败: {str(e)}) ===")
|
||||
|
||||
return "\n\n".join(all_texts)
|
||||
|
||||
|
||||
def split_text_into_chunks(
|
||||
text: str,
|
||||
chunk_size: int = 500,
|
||||
overlap: int = 50
|
||||
) -> List[str]:
|
||||
"""
|
||||
将文本分割成小块
|
||||
|
||||
Args:
|
||||
text: 原始文本
|
||||
chunk_size: 每块的字符数
|
||||
overlap: 重叠字符数
|
||||
|
||||
Returns:
|
||||
文本块列表
|
||||
"""
|
||||
if len(text) <= chunk_size:
|
||||
return [text] if text.strip() else []
|
||||
|
||||
chunks = []
|
||||
start = 0
|
||||
|
||||
while start < len(text):
|
||||
end = start + chunk_size
|
||||
|
||||
# 尝试在句子边界处分割
|
||||
if end < len(text):
|
||||
# 查找最近的句子结束符
|
||||
for sep in ['。', '!', '?', '.\n', '!\n', '?\n', '\n\n', '. ', '! ', '? ']:
|
||||
last_sep = text[start:end].rfind(sep)
|
||||
if last_sep != -1 and last_sep > chunk_size * 0.3:
|
||||
end = start + last_sep + len(sep)
|
||||
break
|
||||
|
||||
chunk = text[start:end].strip()
|
||||
if chunk:
|
||||
chunks.append(chunk)
|
||||
|
||||
# 下一个块从重叠位置开始
|
||||
start = end - overlap if end < len(text) else len(text)
|
||||
|
||||
return chunks
|
||||
|
||||
Reference in New Issue
Block a user