Import 9 alphaear finance skills

- alphaear-deepear-lite: DeepEar Lite API integration
- alphaear-logic-visualizer: Draw.io XML finance diagrams
- alphaear-news: Real-time finance news (10+ sources)
- alphaear-predictor: Kronos time-series forecasting
- alphaear-reporter: Professional financial reports
- alphaear-search: Web search + local RAG
- alphaear-sentiment: FinBERT/LLM sentiment analysis
- alphaear-signal-tracker: Signal evolution tracking
- alphaear-stock: A-Share/HK/US stock data

Updates:
- All scripts updated to use universal .env path
- Added JINA_API_KEY, LLM_*, DEEPSEEK_API_KEY to .env.example
- Updated load_dotenv() to use ~/.config/opencode/.env
2026-03-27 10:11:37 +07:00
parent 7edf5bc4d0
commit 58f9380ec4
149 changed files with 26867 additions and 0 deletions
--- a/skills/alphaear-reporter/scripts/utils/hybrid_search.py
+++ b/skills/alphaear-reporter/scripts/utils/hybrid_search.py
@@ -0,0 +1,216 @@
+import numpy as np
+import os
+from typing import List, Dict, Any, Optional, Union
+from rank_bm25 import BM25Okapi
+from loguru import logger
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+
+class HybridSearcher:
+    """
+    Unified hybrid retrieval engine (Hybrid RAG).
+
+    Ranks documents with BM25 (keyword) and, optionally, sentence-embedding
+    cosine similarity (semantic), then fuses both rankings with Reciprocal
+    Rank Fusion (RRF).
+    """
+
+    def __init__(self, data: List[Dict[str, Any]], text_fields: Optional[List[str]] = None, model_name: Optional[str] = None):
+        """
+        Initialise the searcher and, when data is supplied, the BM25 index.
+
+        Args:
+            data: Documents to index; each element is a dict.
+            text_fields: Keys whose values are joined to form the indexed
+                text. Defaults to ["title", "content"].
+            model_name: Embedding model name. Falls back to the
+                EMBEDDING_MODEL environment variable, then to
+                paraphrase-multilingual-MiniLM-L12-v2.
+        """
+        self.data = data
+        # A fresh list per instance: a list literal as the parameter default
+        # would be shared (and mutable) across every HybridSearcher.
+        self.text_fields = ["title", "content"] if text_fields is None else text_fields
+        self._corpus = []
+        self._full_texts = []
+        self._bm25 = None
+        self._vector_model = None
+        self._embeddings = None
+        self._fitted = False
+        self._vector_fitted = False
+
+        # Default embedding model
+        self.model_name = model_name or os.getenv("EMBEDDING_MODEL", "paraphrase-multilingual-MiniLM-L12-v2")
+
+        if data:
+            self._prepare_corpus()
+            self._fit_bm25()
+            # The embedding model is loaded lazily, on the first vector search.
+
+    def _prepare_corpus(self):
+        """Rebuild the raw texts and the tokenized corpus from self.data."""
+        import jieba  # jieba performs Chinese word segmentation
+
+        self._full_texts = [
+            " ".join(str(item.get(field, "")) for field in self.text_fields)
+            for item in self.data
+        ]
+        self._corpus = [list(jieba.cut(text)) for text in self._full_texts]
+        # Existing embeddings describe the previous corpus; drop them so the
+        # next vector search re-encodes instead of ranking against stale rows.
+        self._embeddings = None
+        self._vector_fitted = False
+
+    def _fit_bm25(self):
+        """Fit the BM25 index on the tokenized corpus, or clear it if empty."""
+        if not self._corpus:
+            # Without this reset an old index would survive an empty reload
+            # and search() would index past the end of self.data.
+            self._bm25 = None
+            self._fitted = False
+            return
+        self._bm25 = BM25Okapi(self._corpus)
+        self._fitted = True
+        logger.info(f"✅ BM25 index fitted with {len(self.data)} documents")
+
+    def _fit_vector(self):
+        """Load the embedding model if needed and encode all documents."""
+        if not self.data:
+            self._embeddings = None
+            self._vector_fitted = False
+            return
+
+        try:
+            # Reuse an already loaded model; only the embeddings need refreshing.
+            if self._vector_model is None:
+                logger.info(f"📡 Loading embedding model: {self.model_name}...")
+                self._vector_model = SentenceTransformer(self.model_name)
+            logger.info(f"🧠 Encoding {len(self._full_texts)} documents...")
+            self._embeddings = self._vector_model.encode(self._full_texts, show_progress_bar=False)
+            self._vector_fitted = True
+            logger.info("✅ Vector index fitted successfully")
+        except Exception as e:
+            # Best effort: callers fall back to BM25-only ranking.
+            logger.error(f"❌ Failed to fit vector index: {e}")
+            self._vector_fitted = False
+
+    def _compute_rrf(self, rank_lists: List[List[int]], k: int = 60) -> List[tuple]:
+        """
+        Compute Reciprocal Rank Fusion (RRF) over several rankings.
+
+        Args:
+            rank_lists: Rankings to fuse; each is a list of document indices
+                ordered best-first.
+            k: RRF smoothing constant, default 60.
+
+        Returns:
+            (index, fused_score) tuples sorted by fused score, highest first.
+        """
+        scores = {}
+        for rank_list in rank_lists:
+            for rank, idx in enumerate(rank_list):
+                # rank is 0-based, hence the +1
+                scores[idx] = scores.get(idx, 0) + 1.0 / (k + rank + 1)
+
+        return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
+
+    def search(self, query: str, top_n: int = 5, use_vector: bool = False) -> List[Dict[str, Any]]:
+        """
+        Run a hybrid search.
+
+        Args:
+            query: Search text.
+            top_n: Maximum number of results to return.
+            use_vector: Whether to fuse in the embedding-similarity ranking.
+
+        Returns:
+            Shallow copies of the best-matching documents, each with a
+            "_search_score" (BM25 score) and, when vector search was used,
+            a "_vector_score" (cosine similarity). Empty when no index is
+            fitted, the query is empty, or top_n is not positive.
+        """
+        if not self._fitted or not query or top_n <= 0:
+            return []
+
+        import jieba
+        query_tokens = list(jieba.cut(query))
+
+        # 1. BM25 ranking
+        bm25_scores = self._bm25.get_scores(query_tokens)
+        bm25_rank = np.argsort(bm25_scores)[::-1].tolist()
+
+        rank_lists = [bm25_rank]
+        similarities = None
+
+        # 2. Vector ranking (fitted on demand)
+        if use_vector:
+            if not self._vector_fitted:
+                self._fit_vector()
+
+            if self._vector_fitted:
+                query_embedding = self._vector_model.encode([query], show_progress_bar=False)
+                similarities = cosine_similarity(query_embedding, self._embeddings)[0]
+                vector_rank = np.argsort(similarities)[::-1].tolist()
+                rank_lists.append(vector_rank)
+            else:
+                logger.warning("Vector search requested but model not fitted, falling back to BM25")
+
+        # 3. Fuse the rankings (RRF) when more than one is available
+        if len(rank_lists) > 1:
+            final_rank = [idx for idx, _ in self._compute_rrf(rank_lists)]
+        else:
+            final_rank = bm25_rank
+
+        # Copy the top hits and attach their relevance scores
+        results = []
+        for idx in final_rank[:top_n]:
+            hit = self.data[idx].copy()
+            hit["_search_score"] = float(bm25_scores[idx])
+            if similarities is not None:
+                hit["_vector_score"] = float(similarities[idx])
+            results.append(hit)
+
+        return results
+
+class InMemoryRAG(HybridSearcher):
+    """In-memory RAG used for ReportAgent cross-section retrieval."""
+
+    def search(self, query: str, top_n: int = 3, use_vector: bool = True) -> List[Dict[str, Any]]:
+        """Search the in-memory data; vector search is enabled by default."""
+        return super().search(query, top_n=top_n, use_vector=use_vector)
+
+    def update_data(self, new_data: List[Dict[str, Any]]):
+        """
+        Replace the indexed data and rebuild the indexes.
+
+        Args:
+            new_data: The new document list; may be empty.
+        """
+        self.data = new_data
+        # Drop every index built from the old data first. The refit steps
+        # below do nothing for an empty list, and a surviving old index
+        # would make search() look up positions that no longer exist.
+        self._bm25 = None
+        self._fitted = False
+        self._embeddings = None
+        self._vector_fitted = False
+        self._prepare_corpus()
+        self._fit_bm25()
+        # Refresh the vector index only if the embedding model was already
+        # loaded; otherwise it stays lazy until the first vector search.
+        if self._vector_model is not None:
+            self._fit_vector()
+        logger.info(f"🔄 InMemoryRAG updated with {len(new_data)} items")
+
+class LocalNewsSearch(HybridSearcher):
+    """Persistent RAG: searches historical news stored in the database."""
+
+    def __init__(self, db_manager):
+        """
+        Args:
+            db_manager: DatabaseManager instance; must provide
+                execute_query(sql, params).
+        """
+        self.db = db_manager
+        # No data is loaded up front; load_history() fills the index.
+        super().__init__([], ["title", "content"])
+
+    def load_history(self, days: int = 30, limit: int = 1000):
+        """
+        Load the most recent news rows from the database and build the index.
+
+        Args:
+            days: NOTE(review): currently unused - the query restricts by
+                row count only, not by date. Confirm the publish_time
+                storage format before adding a date filter.
+            limit: Maximum number of rows to load, newest first.
+
+        Errors are logged and swallowed; the previously loaded data is kept
+        if the query itself fails.
+        """
+        try:
+            # Assumes db_manager exposes execute_query
+            query = "SELECT title, content, publish_time, source FROM daily_news ORDER BY publish_time DESC LIMIT ?"
+            results = self.db.execute_query(query, (limit,))
+
+            data = []
+            for row in results:
+                # Mapping-like rows (exposing keys()) convert directly;
+                # plain sequences are mapped by column position.
+                if hasattr(row, 'keys'):
+                    item = dict(row)
+                else:
+                    item = {
+                        "title": row[0],
+                        "content": row[1],
+                        "publish_time": row[2],
+                        "source": row[3]
+                    }
+                data.append(item)
+
+            self.data = data
+            # Invalidate indexes built from a previous load so that neither
+            # BM25 ranks nor embeddings can refer to rows that are gone.
+            self._bm25 = None
+            self._fitted = False
+            self._embeddings = None
+            self._vector_fitted = False
+            self._prepare_corpus()
+            self._fit_bm25()
+            # Vectors are not fitted here; they are built on the first search.
+            logger.info(f"📚 LocalNewsSearch loaded {len(data)} items from history")
+        except Exception as e:
+            logger.error(f"Failed to load history for search: {e}")
+
+    def search(self, query: str, top_n: int = 5, use_vector: bool = True) -> List[Dict[str, Any]]:
+        """Search local history (vector search on by default), loading it first if empty."""
+        if not self.data:
+            self.load_history()
+        return super().search(query, top_n=top_n, use_vector=use_vector)