import hashlib
import json
import re
from types import SimpleNamespace
from typing import List, Dict, Any, Optional

import pandas as pd
from loguru import logger

from .utils.database_manager import DatabaseManager
from .utils.json_utils import extract_json


class ReportUtils:
    """Research-report helper toolkit (ReportUtils).

    Provides formatting, citation management, JSON extraction and similar
    helper functions. The core generation logic (clustering, writing) has
    been handed over to the Agent.
    """

    def __init__(self, db: DatabaseManager):
        """Store the database manager used for reference lookups."""
        self.db = db
        logger.info("📝 ReportUtils initialized")

    @staticmethod
    def _make_cite_key(url: str, title: str = "", source_name: str = "") -> str:
        """Return a stable citation key of the form ``SF-<8 hex chars>``.

        The key is derived from the stripped URL when one is given; otherwise
        from ``"<title>|<source_name>"``. SHA-1 is used purely as a stable
        fingerprint here, not for security.
        """
        basis = (url or "").strip() or f"{(title or '').strip()}|{(source_name or '').strip()}"
        digest = hashlib.sha1(basis.encode("utf-8")).hexdigest()[:8]
        return f"SF-{digest}"

    @staticmethod
    def _extract_source_items(signal: Any) -> list[Dict[str, Any]]:
        """Return the list of source records attached to one signal.

        Resolution order:
          1. an object exposing a truthy ``sources`` attribute;
          2. a dict with a non-empty list under ``"sources"``;
          3. a dict without truthy ``"sources"`` but with ``"url"`` or
             ``"title"`` -- the signal itself is treated as its only source.

        Anything else yields an empty list.
        """
        attr_sources = getattr(signal, "sources", None)
        if attr_sources:
            return list(attr_sources)

        if not isinstance(signal, dict):
            return []

        dict_sources = signal.get("sources")
        if dict_sources:
            # A truthy but non-list "sources" value is ignored (no fallback),
            # matching the long-standing behavior of this method.
            return list(dict_sources) if isinstance(dict_sources, list) else []

        if signal.get("url") or signal.get("title"):
            return [
                {
                    "title": signal.get("title"),
                    "url": signal.get("url"),
                    "source_name": signal.get("source") or signal.get("source_name"),
                    "publish_time": signal.get("publish_time"),
                }
            ]
        return []

    def build_bibliography(self, signals: List[Any]) -> tuple[list[Dict[str, Any]], Dict[int, list[str]]]:
        """Build stable bibliography entries and per-signal cite key mapping.

        Args:
            signals: Signals to collect sources from; each is either a dict or
                an object with a ``sources`` attribute.

        Returns:
            A ``(entries, signal_to_keys)`` tuple. ``entries`` holds one dict
            per unique cite key (keys: ``key``, ``url``, ``title``, ``source``,
            ``publish_time``) in first-seen order. ``signal_to_keys`` maps the
            1-based position of each signal that has sources to its
            de-duplicated cite keys.
        """
        bib_by_key: Dict[str, Dict[str, Any]] = {}
        signal_to_keys: Dict[int, list[str]] = {}

        for sig_idx, signal in enumerate(signals, 1):
            source_items = self._extract_source_items(signal)
            if not source_items:
                continue

            for src in source_items:
                # NOTE(review): each source is assumed to be dict-like
                # (supports ``.get``) -- confirm for object-style signals.
                url = (src.get("url") or "").strip()
                title = (src.get("title") or "").strip()
                source_name = (src.get("source_name") or src.get("source") or "").strip()
                raw_time = src.get("publish_time")
                # Only string timestamps are stripped; other types pass through.
                publish_time = raw_time.strip() if isinstance(raw_time, str) else raw_time

                key = self._make_cite_key(url=url, title=title, source_name=source_name)
                keys_for_signal = signal_to_keys.setdefault(sig_idx, [])
                if key not in keys_for_signal:
                    keys_for_signal.append(key)

                if key in bib_by_key:
                    continue

                # Prefer canonical metadata from DB when possible
                enriched = self.db.lookup_reference_by_url(url) if url else None
                bib_by_key[key] = {
                    "key": key,
                    "url": url or (enriched.get("url") if enriched else ""),
                    "title": (enriched.get("title") if enriched else None) or title or "(无标题)",
                    "source": (enriched.get("source") if enriched else None) or source_name or "(未知来源)",
                    "publish_time": (enriched.get("publish_time") if enriched else None) or publish_time or "",
                }

        return list(bib_by_key.values()), signal_to_keys

    @staticmethod
    def render_references_section(bib_entries: list[Dict[str, Any]]) -> str:
        """Render bibliography entries as a Markdown references section.

        Entries are numbered ``[1]``, ``[2]``, ... in the order given. Each
        line has the form ``[i] title (source,publish_time), url`` with the
        publish time and URL parts omitted when empty. The result always ends
        with exactly one newline.
        """
        lines = ["## 参考文献", ""]
        if not bib_entries:
            lines.append("(无)")
            return "\n".join(lines).strip() + "\n"

        for i, entry in enumerate(bib_entries, 1):
            title = entry.get("title") or "(无标题)"
            source = entry.get("source") or "(未知来源)"
            url = entry.get("url") or ""
            publish_time = entry.get("publish_time") or ""
            suffix = f",{publish_time}" if publish_time else ""
            label = f"[{i}]"
            if url:
                lines.append(f"{label} {title} ({source}{suffix}), {url}")
            else:
                lines.append(f"{label} {title} ({source}{suffix})")

        return "\n".join(lines).strip() + "\n"

    @staticmethod
    def sanitize_json_chart_blocks(text: str) -> str:
        """Best-effort repair for malformed json-chart fenced blocks.

        Only one repair is applied: when the text contains a ```json-chart
        fence and the total number of ``` markers is odd, a closing fence is
        appended. Empty or ``None`` input is returned unchanged.
        """
        if not text:
            return text

        # Simplified logic: if the closing ``` is missing, append it.
        # The fuller regex-based repair was dropped; only this closure fix remains.
        if "```json-chart" in text and text.count("```") % 2 != 0:
            text += "\n```"
        return text

    @staticmethod
    def build_structured_report(
        report_md: str,
        signals: List[Dict[str, Any]],
        clusters: List[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Build a structured report payload (for front-end rendering / JSON).

        Args:
            report_md: The report as Markdown text.
            signals: Currently unused; kept for interface compatibility.
            clusters: Currently unused; kept for interface compatibility.

        Returns:
            A dict with:
              * ``title`` -- text of the first line starting with ``"# "``,
                or ``"研报"`` when there is none;
              * ``summary_bullets`` -- up to 8 bullet items (``-``, ``*`` or
                ``•``) found anywhere in the report, markers removed;
              * ``sections`` -- one ``{"title", "content"}`` dict per level
                2-4 heading, plus an implicit ``"摘要"`` section for any
                non-blank text that precedes the first such heading.
        """
        text = (report_md or "").strip()
        lines = text.splitlines() if text else []

        title = "研报"
        title_idx: Optional[int] = None
        for idx, line in enumerate(lines):
            if line.startswith("# "):
                # Drop only the leading marker. ``str.replace`` would also
                # delete later "# " occurrences (e.g. "# C# Guide").
                title = line[2:].strip()
                title_idx = idx
                break

        heading_re = re.compile(r"^(#{2,4})\s+(.*)$")
        sections: List[Dict[str, Any]] = []
        current: Optional[Dict[str, Any]] = None
        for idx, line in enumerate(lines):
            if idx == title_idx:
                # The title is reported separately; keep it out of section text.
                continue

            heading = heading_re.match(line.strip())
            if heading:
                if current:
                    sections.append(current)
                current = {"title": heading.group(2).strip(), "content": []}
                continue

            if current is None:
                if not line.strip():
                    # Blank lines alone must not open an empty implicit section.
                    continue
                current = {"title": "摘要", "content": []}
            current["content"].append(line)
        if current:
            sections.append(current)

        bullets = [
            re.sub(r"^[-*•]\s+", "", raw.strip())
            for raw in lines
            if raw.strip().startswith(("- ", "* ", "• "))
        ]
        bullets = [b for b in bullets if b]

        return {
            "title": title,
            "summary_bullets": bullets[:8],
            "sections": [
                {"title": s["title"], "content": "\n".join(s["content"]).strip()}
                for s in sections
            ],
        }

    @staticmethod
    def _clean_ticker(ticker_raw: str) -> str:
        """Normalize a raw ticker string.

        Returns the digit characters of the stripped input joined together;
        when the input contains no digits, the stripped input itself is
        returned. Empty or ``None`` input yields ``""``.
        """
        t = (ticker_raw or "").strip()
        if not t:
            return ""
        digits = "".join(c for c in t if c.isdigit())
        return digits or t