Files
opencode-skill/skills/alphaear-predictor/scripts/utils/search_tools.py
Kunthawat Greethong 58f9380ec4 Import 9 alphaear finance skills
- alphaear-deepear-lite: DeepEar Lite API integration
- alphaear-logic-visualizer: Draw.io XML finance diagrams
- alphaear-news: Real-time finance news (10+ sources)
- alphaear-predictor: Kronos time-series forecasting
- alphaear-reporter: Professional financial reports
- alphaear-search: Web search + local RAG
- alphaear-sentiment: FinBERT/LLM sentiment analysis
- alphaear-signal-tracker: Signal evolution tracking
- alphaear-stock: A-Share/HK/US stock data

Updates:
- All scripts updated to use universal .env path
- Added JINA_API_KEY, LLM_*, DEEPSEEK_API_KEY to .env.example
- Updated load_dotenv() to use ~/.config/opencode/.env
2026-03-27 10:11:37 +07:00

612 lines
29 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that may be invisible to human readers but can be processed differently by a computer. If you believe this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import hashlib
import json
import re
import requests
import time
import threading
from typing import List, Dict, Optional, Any
from agno.tools.duckduckgo import DuckDuckGoTools
from agno.tools.baidusearch import BaiduSearchTools
from agno.agent import Agent
from loguru import logger
from datetime import datetime
from .database_manager import DatabaseManager
from .content_extractor import ContentExtractor
from .llm.factory import get_model
from .hybrid_search import LocalNewsSearch
# Default search-cache TTL in seconds; can be overridden via environment variable.
DEFAULT_SEARCH_TTL = int(os.getenv("SEARCH_CACHE_TTL", "3600")) # default: 1 hour
class JinaSearchEngine:
    """Wrapper around the Jina Search API — performs web search via s.jina.ai.

    A class-level sliding-window rate limiter is shared by all instances:
    without an API key at most ``_rate_limit_no_key`` requests are sent per
    ``_rate_window`` seconds, spaced at least ``_min_interval`` seconds apart.
    """
    JINA_SEARCH_URL = "https://s.jina.ai/"
    # Rate-limit configuration (class-wide state, guarded by _lock).
    _rate_limit_no_key = 10   # max requests per window without an API key
    _rate_window = 60.0       # sliding-window length in seconds
    _min_interval = 2.0       # minimum spacing between keyless requests
    _request_times = []       # timestamps of recent requests (keyless path)
    _last_request_time = 0.0
    _lock = threading.Lock()

    def __init__(self):
        # An API key raises Jina's server-side rate limits considerably.
        self.api_key = os.getenv("JINA_API_KEY", "").strip()
        self.has_api_key = bool(self.api_key)
        if self.has_api_key:
            logger.info("✅ Jina Search API key configured")

    @classmethod
    def _wait_for_rate_limit(cls, has_api_key: bool) -> None:
        """Block until a request may be sent under the local rate limit."""
        if has_api_key:
            # Keyed requests only need a small courtesy delay.
            time.sleep(0.3)
            return
        with cls._lock:
            current_time = time.time()
            # Drop timestamps that have fallen out of the sliding window.
            cls._request_times = [t for t in cls._request_times if current_time - t < cls._rate_window]
            if len(cls._request_times) >= cls._rate_limit_no_key:
                oldest = cls._request_times[0]
                wait_time = cls._rate_window - (current_time - oldest) + 1.0
                if wait_time > 0:
                    logger.warning(f"⏳ Jina Search rate limit, waiting {wait_time:.1f}s...")
                    time.sleep(wait_time)
                    current_time = time.time()
                    cls._request_times = [t for t in cls._request_times if current_time - t < cls._rate_window]
            # Enforce a minimum interval between consecutive requests.
            time_since_last = current_time - cls._last_request_time
            if time_since_last < cls._min_interval:
                time.sleep(cls._min_interval - time_since_last)
            cls._request_times.append(time.time())
            cls._last_request_time = time.time()

    def search(self, query: str, max_results: int = 5, _retries: int = 3) -> List[Dict]:
        """
        Execute a web search via the Jina Search API.

        Args:
            query: Search keywords.
            max_results: Maximum number of results to return.
            _retries: Internal retry budget for HTTP 429 responses; bounds
                the retry recursion so a persistently rate-limited server
                cannot cause an infinite loop.

        Returns:
            List of result dicts, each with title, url and content keys
            (plus href/body aliases for compatibility).
        """
        if not query:
            return []
        logger.info(f"🔍 Jina Search: {query}")
        # Honor the local rate limit before hitting the network.
        self._wait_for_rate_limit(self.has_api_key)
        headers = {
            "Accept": "application/json",
            "X-Retain-Images": "none",
        }
        if self.has_api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        try:
            # Jina Search API: https://s.jina.ai/{query}
            import urllib.parse
            encoded_query = urllib.parse.quote(query)
            url = f"{self.JINA_SEARCH_URL}{encoded_query}"
            response = requests.get(url, headers=headers, timeout=30)
            if response.status_code == 429:
                # Bounded retry so repeated 429s cannot recurse forever.
                if _retries > 0:
                    logger.warning("⚠️ Jina Search rate limited (429), waiting 30s...")
                    time.sleep(30)
                    return self.search(query, max_results, _retries=_retries - 1)
                logger.warning("⚠️ Jina Search rate limited (429), retry budget exhausted")
                return []
            if response.status_code != 200:
                logger.warning(f"Jina Search failed (Status {response.status_code})")
                return []
            # Parse the response body.
            try:
                data = response.json()
            except json.JSONDecodeError:
                # Plain-text response: wrap it as a single pseudo-result.
                data = {"data": [{"title": "Search Result", "url": "", "content": response.text}]}
            results = []
            # Jina may return {"data": [...]} or a bare list.
            items = data.get("data", []) if isinstance(data, dict) else data
            if not isinstance(items, list):
                items = [items] if items else []
            for i, item in enumerate(items[:max_results]):
                if isinstance(item, dict):
                    results.append({
                        "title": item.get("title", f"Result {i+1}"),
                        "url": item.get("url", ""),
                        "href": item.get("url", ""),  # compatibility alias
                        "content": item.get("content", item.get("description", "")),
                        "body": item.get("content", item.get("description", "")),  # compatibility alias
                    })
                elif isinstance(item, str):
                    results.append({
                        "title": f"Result {i+1}",
                        "url": "",
                        "content": item
                    })
            logger.info(f"✅ Jina Search returned {len(results)} results")
            return results
        except requests.exceptions.Timeout:
            logger.error("Jina Search timeout")
            return []
        except requests.exceptions.RequestException as e:
            logger.error(f"Jina Search request error: {e}")
            return []
        except Exception as e:
            logger.error(f"Jina Search unexpected error: {e}")
            return []
class SearchTools:
    """Extensible search toolkit — multi-engine aggregation with content caching."""
def __init__(self, db: DatabaseManager):
self.db = db
# 检查 Jina API Key 是否配置
jina_api_key = os.getenv("JINA_API_KEY", "").strip()
self._jina_enabled = bool(jina_api_key)
self._engines = {
"ddg": DuckDuckGoTools(),
"baidu": BaiduSearchTools(),
"local": LocalNewsSearch(db)
}
# 如果配置了 Jina API Key添加 Jina 引擎
if self._jina_enabled:
self._engines["jina"] = JinaSearchEngine()
logger.info("🚀 Jina Search engine enabled (JINA_API_KEY configured)")
# 确定默认搜索引擎
self._default_engine = "jina" if self._jina_enabled else "ddg"
def _generate_hash(self, query: str, engine: str, max_results: int) -> str:
return hashlib.md5(f"{engine}:{query}:{max_results}".encode()).hexdigest()
def search(self, query: str, engine: str = None, max_results: int = 5, ttl: Optional[int] = None) -> str:
"""
使用指定搜索引擎执行网络搜索,结果会被缓存以提高效率。
Args:
query: 搜索关键词,如 "英伟达财报""光伏行业政策"
engine: 搜索引擎选择。可选值:
"jina" (Jina Search需配置 JINA_API_KEYLLM友好输出),
"ddg" (DuckDuckGo推荐英文/国际搜索),
"baidu" (百度,推荐中文/国内搜索),
"local" (本地历史新闻搜索,基于向量+BM25)。
默认: 若配置了 JINA_API_KEY 则使用 "jina",否则 "ddg"
max_results: 期望返回的结果数量,默认 5 条。
ttl: 缓存有效期(秒)。如果缓存超过此时间会重新搜索。
默认使用环境变量 SEARCH_CACHE_TTL 或 3600 秒。
设为 0 可强制刷新。
Returns:
搜索结果的文本描述,包含标题、摘要和链接。
"""
# 使用默认引擎(如果配置了 Jina 则优先使用 Jina
if engine is None:
engine = self._default_engine
if engine not in self._engines:
return f"Error: Unsupported engine '{engine}'. Available: {list(self._engines.keys())}"
query_hash = self._generate_hash(query, engine, max_results)
effective_ttl = ttl if ttl is not None else DEFAULT_SEARCH_TTL
# 1. 尝试从缓存读取 (local 引擎不缓存,因为它本身就是查库)
if engine != "local":
cache = self.db.get_search_cache(query_hash, ttl_seconds=effective_ttl if effective_ttl > 0 else None)
if cache and effective_ttl != 0:
logger.info(f" Found search results in cache for: {query} ({engine})")
return cache['results']
# 2. 执行真实搜索
logger.info(f"📡 Searching {engine} for: {query}")
try:
tool = self._engines[engine]
if engine == "jina":
# Jina Search 返回 List[Dict]
jina_results = tool.search(query, max_results=max_results)
results = []
for r in jina_results:
results.append({
"title": r.get("title", ""),
"href": r.get("url", ""),
"body": r.get("content", "")
})
elif engine == "ddg":
results = tool.duckduckgo_search(query, max_results=max_results)
elif engine == "baidu":
results = tool.baidu_search(query, max_results=max_results)
elif engine == "local":
# LocalNewsSearch 返回的是 List[Dict]
local_results = tool.search(query, top_n=max_results)
results = []
for r in local_results:
results.append({
"title": r.get("title"),
"href": r.get("url", "local"),
"body": r.get("content", "")
})
else:
results = "Search not implemented for this engine."
results_str = str(results)
if engine != "local":
self.db.save_search_cache(query_hash, query, engine, results_str)
return results_str
except Exception as e:
# 搜索失败时的降级策略
if engine == "jina":
logger.warning(f"⚠️ Jina search failed, falling back to ddg: {query} ({e})")
try:
return self.search(query, engine="ddg", max_results=max_results, ttl=ttl)
except Exception as e2:
logger.error(f"❌ DDG fallback also failed for {query}: {e2}")
elif engine == "ddg":
logger.warning(f"⚠️ DDG search failed, falling back to baidu: {query} ({e})")
try:
return self.search(query, engine="baidu", max_results=max_results, ttl=ttl)
except Exception as e2:
logger.error(f"❌ Baidu fallback also failed for {query}: {e2}")
logger.error(f"❌ Search failed for {query}: {e}")
return f"Error occurred during search: {str(e)}"
def search_list(self, query: str, engine: str = None, max_results: int = 5, ttl: Optional[int] = None, enrich: bool = True) -> List[Dict]:
"""
执行搜索并返回结构化列表 (List[Dict])。
Dict 包含: title, href (or url), body (or snippet)
Args:
engine: 搜索引擎默认使用配置的默认引擎Jina 优先)
enrich: 是否抓取正文内容 (默认 True)
"""
# 使用默认引擎
if engine is None:
engine = self._default_engine
if engine not in self._engines:
logger.error(f"Unsupported engine {engine}")
return []
# 不同的 hash 以区分是否 enrichment
enrich_suffix = ":enriched" if enrich else ""
query_hash = self._generate_hash(query, engine + enrich_suffix, max_results)
effective_ttl = ttl if ttl is not None else DEFAULT_SEARCH_TTL
# 1. 尝试从缓存读取
cache = self.db.get_search_cache(query_hash, ttl_seconds=effective_ttl if effective_ttl > 0 else None)
if cache and effective_ttl != 0:
try:
cached_data = json.loads(cache['results'])
if isinstance(cached_data, list):
logger.info(f" Found structured search cache for: {query}")
return cached_data
except:
pass
# 1.5 Smart Cache (Fuzzy + LLM)
if effective_ttl != 0:
try:
# 1. Similar cached queries
similar_queries = self.db.find_similar_queries(query, limit=3)
# Filter by TTL
valid_candidates = []
for q in similar_queries:
if q['query'] == query: continue
q_time = datetime.fromisoformat(q['timestamp'])
if effective_ttl and (datetime.now() - q_time).total_seconds() > effective_ttl:
continue
q['type'] = 'cached_search'
valid_candidates.append(q)
# 2. Relevant local news (as search results)
local_news = self.db.search_local_news(query, limit=3)
if local_news:
# Group local news as a single "candidate" source? Or individual?
# Better to treat "Local News Database" as one candidate source that contains X items.
# Or just add them to candidates list?
# Let's package strictly relevant news as a "local_news_bundle"
valid_candidates.append({
'type': 'local_news',
'query': 'Local Database News',
'items': local_news,
'timestamp': datetime.now().isoformat()
})
if valid_candidates:
logger.info(f"🤔 Found {len(valid_candidates)} smart cache candidates (Queries/News). Asking LLM...")
evaluation = self._evaluate_cache_relevance(query, valid_candidates)
if evaluation and evaluation.get('reuse', False):
idx = evaluation.get('index', -1)
if 0 <= idx < len(valid_candidates):
chosen = valid_candidates[idx]
logger.info(f"🤖 LLM suggested reusing: '{chosen.get('query')}' ({chosen['type']})")
if chosen['type'] == 'cached_search':
# Load the chosen cache
cache = self.db.get_search_cache(chosen['query_hash'])
if cache:
try:
cached_data = json.loads(cache['results'])
if isinstance(cached_data, list):
return cached_data
except:
pass
elif chosen['type'] == 'local_news':
# Convert local news items to search result format
news_results = []
for i, news in enumerate(chosen['items'], 1):
news_results.append({
"id": news.get('id'),
"rank": i,
"title": news.get('title'),
"url": news.get('url'),
"content": news.get('content'),
"original_snippet": news.get('content')[:200] if news.get('content') else '',
"source": f"Local News ({news.get('source')})",
"publish_time": news.get('publish_time'),
"crawl_time": news.get('crawl_time'),
"sentiment_score": news.get('sentiment_score', 0),
"meta_data": {"origin": "local_db"}
})
return news_results
except Exception as e:
logger.warning(f"Smart cache check failed: {e}")
# 2. 执行搜索
logger.info(f"📡 Searching {engine} (structured) for: {query}")
try:
tool = self._engines[engine]
results = []
if engine == "jina":
# Jina Search 直接返回结构化数据
jina_results = tool.search(query, max_results=max_results)
for r in jina_results:
results.append({
"title": r.get("title", ""),
"url": r.get("url", ""),
"href": r.get("url", ""),
"body": r.get("content", ""),
"content": r.get("content", ""),
"source": "Jina Search"
})
elif engine == "ddg":
results = tool.duckduckgo_search(query, max_results=max_results)
elif engine == "baidu":
results = tool.baidu_search(query, max_results=max_results)
elif engine == "local":
# LocalNewsSearch 返回的是 List[Dict]
local_results = tool.search(query, top_n=max_results)
results = []
for r in local_results:
results.append({
"title": r.get("title"),
"url": r.get("url", "local"),
"body": r.get("content", "")[:500],
"source": f"Local ({r.get('source', 'db')})",
"publish_time": r.get("publish_time")
})
# 处理字符串类型的 JSON 返回 (Baidu 常返 JSON 字符串)
if isinstance(results, str) and engine not in ["local", "jina"]:
try:
results = json.loads(results)
except:
pass
# 转为统一格式
normalized_results = []
if isinstance(results, list):
for i, r in enumerate(results, 1):
title = r.get('title', '')
url = r.get('href') or r.get('url') or r.get('link', '')
content = r.get('body') or r.get('snippet') or r.get('abstract', '')
if title and url:
normalized_results.append({
"id": self._generate_hash(url + query, "search_item", i),
"rank": i,
"title": title,
"url": url,
"content": content,
"original_snippet": content, # 保留摘要
"source": f"Search ({engine})",
"publish_time": datetime.now().isoformat(), # 暂用当前时间
"crawl_time": datetime.now().isoformat(),
"meta_data": {"query": query, "engine": engine}
})
# Fallback if still string and failed to parse
elif isinstance(results, str) and results:
normalized_results.append({"title": query, "url": "", "content": results, "source": engine})
# 3. 抓取正文 & 计算情绪 (Enrichment)
# 注意:如果使用 Jina Search内容已经是 LLM 友好格式,可选择跳过 enrichment
skip_content_enrichment = (engine == "jina")
if enrich and normalized_results:
logger.info(f"🕸️ Enriching {len(normalized_results)} search results with Jina & Sentiment...")
extractor = ContentExtractor()
# Lazy load sentiment tool
if not hasattr(self, 'sentiment_tool') or self.sentiment_tool is None:
from ..sentiment_tools import SentimentTools
self.sentiment_tool = SentimentTools(self.db)
for item in normalized_results:
if item.get("url"):
try:
# 如果是 Jina Search内容已经足够好跳过额外抓取
if skip_content_enrichment and item.get("content") and len(item.get("content", "")) > 100:
full_content = item["content"]
else:
# Use Jina Reader to get full content
full_content = extractor.extract_with_jina(item["url"], timeout=60)
if full_content and len(full_content) > 100:
item["content"] = full_content
# Calculate sentiment
# Use title + snippet of content for efficiency
text_to_analyze = f"{item['title']} {full_content[:500]}"
sent_result = self.sentiment_tool.analyze_sentiment(text_to_analyze) # Using self.sentiment_tool
score = sent_result.get('score', 0.0)
item["sentiment_score"] = float(score)
logger.info(f" ✅ Enriched: {item['title'][:20]}... (Sentiment: {score:.2f})")
else:
# Fallback: Use snippet for sentiment
logger.info(f" ⚠️ Content short/failed for {item['url']}, using snippet for sentiment.")
text_to_analyze = f"{item['title']} {item['content']}" # content is snippet here
sent_result = self.sentiment_tool.analyze_sentiment(text_to_analyze)
score = sent_result.get('score', 0.0)
item["sentiment_score"] = float(score)
except Exception as e:
# Fallback: Use snippet for sentiment on error
logger.warning(f"Failed to enrich {item['url']}: {e}. Using snippet.")
text_to_analyze = f"{item['title']} {item['content']}"
sent_result = self.sentiment_tool.analyze_sentiment(text_to_analyze)
score = sent_result.get('score', 0.0)
item["sentiment_score"] = float(score)
# 缓存结果 list
if normalized_results:
# Pass list directly, DB manager will handle JSON dump for main cache and populate search_details
# Only cache if NOT from local news reuse (though this logic path is for fresh search)
self.db.save_search_cache(query_hash, query, engine, normalized_results)
return normalized_results
except Exception as e:
# 搜索失败时的降级策略
if engine == "jina":
logger.warning(f"⚠️ Jina search_list failed, falling back to ddg: {query} ({e})")
try:
return self.search_list(query, engine="ddg", max_results=max_results, ttl=ttl, enrich=enrich)
except Exception as e2:
logger.error(f"❌ DDG fallback (search_list) also failed for {query}: {e2}")
elif engine == "ddg":
logger.warning(f"⚠️ DDG search_list failed, falling back to baidu: {query} ({e})")
try:
return self.search_list(query, engine="baidu", max_results=max_results, ttl=ttl, enrich=enrich)
except Exception as e2:
logger.error(f"❌ Baidu fallback (search_list) also failed for {query}: {e2}")
logger.error(f"❌ Structured search failed for {query}: {e}")
return []
def _evaluate_cache_relevance(self, current_query: str, candidates: List[Dict]) -> Dict:
"""
使用 LLM 评估缓存候选是否足以回答当前问题。
"""
try:
# Prepare candidates text
candidates_desc = []
for i, c in enumerate(candidates):
if c['type'] == 'cached_search':
# Preview cached results if available?
# Maybe just use the query string as a proxy for what's in there.
# Or peek at 'results' snippet.
preview = ""
try:
# Attempt to peek first result title from JSON string
# Note: c.get('results') might be a stringified JSON list
res_list = json.loads(c.get('results', '[]'))
if res_list and isinstance(res_list, list) and len(res_list) > 0:
first_item = res_list[0]
if isinstance(first_item, dict) and 'title' in first_item:
preview = f" (Contains: {first_item.get('title', '')[:50]}...)"
except:
pass
candidates_desc.append(f"[{i}] Old Search Query: '{c['query']}' {preview} (Time: {c['timestamp']})")
elif c['type'] == 'local_news':
# List titles of local news
titles = [item['title'] for item in c['items'][:3]]
candidates_desc.append(f"[{i}] Local Database News: {', '.join(titles)}... (Time: {c['timestamp']})")
prompt = f"""
Task: Decide if existing information is sufficient for the new search query.
New Query: "{current_query}"
Available Information Candidates:
{chr(10).join(candidates_desc)}
Instructions:
1. Analyze if any candidate provides ENOUGH up-to-date info for the "New Query".
2. If yes, choose the best one.
3. If the query implies needing LATEST real-time info and candidates are old, choose none.
4. Return strictly JSON: {{"reuse": true/false, "index": <candidate_index_int>, "reason": "short explanation"}}
"""
# 初始化模型
provider = os.getenv("LLM_PROVIDER", "ust")
model_id = os.getenv("LLM_MODEL", "Qwen")
host = os.getenv("LLM_HOST")
if host:
model = get_model(provider, model_id, host=host)
else:
model = get_model(provider, model_id)
agent = Agent(model=model, markdown=True)
response = agent.run(prompt)
content = response.content
# Parse JSON
json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
if json_match:
return json.loads(json_match.group(1))
elif '{' in content:
# Fallback for cases where LLM doesn't wrap in ```json
return json.loads(content[content.find('{'):content.rfind('}')+1])
return {"reuse": False}
except Exception as e:
logger.warning(f"LLM evaluation failed: {e}")
return {"reuse": False}
def aggregate_search(self, query: str, engines: Optional[List[str]] = None, max_results: int = 5) -> str:
"""
使用多个搜索引擎同时搜索并聚合结果,获得更全面的信息覆盖。
Args:
query: 搜索关键词。
engines: 要使用的搜索引擎列表。可选值: ["ddg", "baidu"]。
默认同时使用 ddg 和 baidu。
max_results: 每个引擎期望返回的结果数量。
Returns:
聚合后的搜索结果,按引擎分组显示。
"""
engines = engines or ["ddg", "baidu"]
aggregated_results = []
for engine in engines:
res = self.search(query, engine=engine, max_results=max_results)
aggregated_results.append(f"--- Results from {engine.upper()} ---\n{res}")
return "\n\n".join(aggregated_results)