diff --git a/.DS_Store b/.DS_Store index c5608d8..3936d55 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/.env.example b/.env.example index 6e4008d..3c0bf3b 100644 --- a/.env.example +++ b/.env.example @@ -106,19 +106,19 @@ SHODH_USER_ID=default # JINA API - For content extraction JINA_API_KEY= -# LLM Configuration (AlphaEar uses multiple providers) -LLM_PROVIDER=ust -LLM_MODEL=Qwen +# LLM Configuration (AlphaEar uses MiniMax by default - OpenAI compatible) +LLM_PROVIDER=minimax +LLM_MODEL=MiniMax-Text-01 LLM_HOST= # Reasoning Model (for alphaear-predictor) -REASONING_MODEL_PROVIDER=openai -REASONING_MODEL_ID=gpt-4o +REASONING_MODEL_PROVIDER=minimax +REASONING_MODEL_ID=MiniMax-Text-01 REASONING_MODEL_HOST= # Tool Model (for alphaear-predictor) -TOOL_MODEL_PROVIDER=openai -TOOL_MODEL_ID=gpt-4o +TOOL_MODEL_PROVIDER=minimax +TOOL_MODEL_ID=MiniMax-Text-01 TOOL_MODEL_HOST= # Embedding Model (for Kronos predictor) diff --git a/scripts/install-openclaw-skills.sh b/scripts/install-openclaw-skills.sh index 909e247..617ed60 100755 --- a/scripts/install-openclaw-skills.sh +++ b/scripts/install-openclaw-skills.sh @@ -67,17 +67,20 @@ find_opencode_folders() { install_all_to_folder() { local target_dir="$1" - local count=0 mkdir -p "$target_dir" - for skill_dir in "$SKILLS_DIR"/*/; do - [ -d "$skill_dir" ] || continue - skill_name=$(basename "$skill_dir") - if [ -f "$skill_dir/SKILL.md" ]; then - [ -d "${target_dir}/${skill_name}" ] && rm -rf "${target_dir}/${skill_name}" - cp -r "$skill_dir" "${target_dir}/${skill_name}" - count=$((count + 1)) - fi - done + if command -v rsync &> /dev/null; then + rsync -a --delete "$SKILLS_DIR/" "$target_dir/" + else + for skill_dir in "$SKILLS_DIR"/*/; do + [ -d "$skill_dir" ] || continue + skill_name=$(basename "$skill_dir") + if [ -f "$skill_dir/SKILL.md" ]; then + [ -d "${target_dir}/${skill_name}" ] && rm -rf "${target_dir}/${skill_name}" + cp -r "$skill_dir" "${target_dir}/${skill_name}" + fi + done + fi + local count=$(ls -d "$target_dir"/*/ 2>/dev/null | wc -l | tr -d ' ') echo -e "${SUCCESS}[OK]${NC} Installed $count skills to ${target_dir}" } diff --git a/skills/SEO_SKILLS_IMPLEMENTATION_STATUS.md b/skills/SEO_SKILLS_IMPLEMENTATION_STATUS.md deleted file mode 100644 index 36c41c7..0000000 --- a/skills/SEO_SKILLS_IMPLEMENTATION_STATUS.md +++ /dev/null @@ -1,434 +0,0 @@ -# 🎯 SEO Multi-Channel Skill Set - Complete Implementation - -**Status:** Core implementation complete -**Created:** 2026-03-08 -**Based on:** SEOMachine workflow + Multi-channel requirements - ---- - -## ✅ WHAT'S BEEN CREATED - -### **1. seo-multi-channel Skill** ✅ COMPLETE - -**Location:** `skills/seo-multi-channel/` - -**Files Created:** -- `SKILL.md` - Complete documentation (828 lines) -- `scripts/generate_content.py` - Main generator with Thai support -- `scripts/templates/facebook.yaml` - Facebook organic posts -- `scripts/templates/facebook_ads.yaml` - Facebook Ads (API-ready) -- `scripts/templates/google_ads.yaml` - Google Ads (API-ready) -- `scripts/templates/blog.yaml` - SEO blog posts -- `scripts/templates/x_thread.yaml` - Twitter/X threads -- `scripts/requirements.txt` - Python dependencies -- `scripts/.env.example` - Credentials template - -**Features Implemented:** -- ✅ Thai language processing with PyThaiNLP -- ✅ 5 channels: Facebook > Facebook Ads > Google Ads > Blog > X -- ✅ Image handling (generation for non-product, edit for product) -- ✅ API-ready output structures (Meta Graph API, Google Ads API) -- ✅ Website-creator integration design -- ✅ Auto-publish to Astro content collections - ---- - -### **2. Remaining Skills (Skeleton Structure)** - -The following skills need to be created with full implementation. Below are the SKILL.md templates and key Python modules. - ---- - -## 📁 seo-analyzers Skill - -**Purpose:** Thai language content analysis and quality scoring - -### SKILL.md Template: - -```markdown ---- -name: seo-analyzers -description: Analyze content quality with Thai language support. Use for keyword density, readability scoring, and SEO quality rating (0-100). ---- - -# 🔍 SEO Analyzers - Thai Language Content Analysis - -## Purpose - -Analyze content quality with full Thai language support: -- ✅ Thai keyword density (PyThaiNLP-based) -- ✅ Thai readability scoring -- ✅ Content quality rating (0-100) -- ✅ AI pattern detection (content scrubbing) - -## Usage - -```bash -# Analyze keyword density -python3 skills/seo-analyzers/scripts/thai_keyword_analyzer.py \ - --content "article text here" \ - --keyword "บริการ podcast" - -# Score content quality -python3 skills/seo-analyzers/scripts/content_quality_scorer.py \ - --file article.md \ - --language th -``` - -## Modules - -1. **thai_keyword_analyzer.py** - Thai keyword density, distribution, clustering -2. **thai_readability.py** - Thai readability scoring (grade level, formality) -3. **content_quality_scorer.py** - Overall 0-100 quality score -4. **content_scrubber_thai.py** - Remove AI patterns (Thai-aware) - -## Thai Language Adaptations - -### Word Counting -- English: `len(text.split())` -- Thai: PyThaiNLP word_tokenize (no spaces between Thai words) - -### Readability -- English: Flesch Reading Ease -- Thai: Average sentence length + formality detection - -### Keyword Density -- Thai: 1.0-1.5% (lower due to compound words) -- English: 1.5-2.0% -``` - -### Key Python Module: thai_keyword_analyzer.py - -```python -#!/usr/bin/env python3 -"""Thai Keyword Analyzer - Keyword density for Thai text""" - -from pythainlp import word_tokenize -from pythainlp.util import normalize -from typing import Dict, List - -class ThaiKeywordAnalyzer: - """Analyze keyword density in Thai text""" - - def count_words(self, text: str) -> int: - """Count Thai words accurately""" - tokens = word_tokenize(text, engine="newmm") - return len([t for t in tokens if t.strip()]) - - def calculate_density(self, text: str, keyword: str) -> float: - """Calculate keyword density""" - text_norm = normalize(text) - keyword_norm = normalize(keyword) - count = text_norm.count(keyword_norm) - word_count = self.count_words(text) - return (count / word_count * 100) if word_count > 0 else 0 - - def analyze(self, text: str, keyword: str) -> Dict: - """Full keyword analysis""" - density = self.calculate_density(text, keyword) - - return { - 'word_count': self.count_words(text), - 'keyword': keyword, - 'occurrences': text.count(keyword), - 'density': round(density, 2), - 'status': self._get_density_status(density), - 'recommendations': self._get_recommendations(density) - } - - def _get_density_status(self, density: float) -> str: - if density < 0.5: - return "too_low" - elif density < 1.0: - return "slightly_low" - elif density <= 1.5: - return "optimal" - elif density <= 2.0: - return "slightly_high" - else: - return "too_high" - - def _get_recommendations(self, density: float) -> List[str]: - recs = [] - if density < 1.0: - recs.append("เพิ่มการใช้คำหลักในเนื้อหา (target: 1.0-1.5%)") - elif density > 2.0: - recs.append("ลดการใช้คำหลักลง อาจถูกมองว่า keyword stuffing") - return recs -``` - ---- - -## 📁 seo-data Skill - -**Purpose:** Analytics integrations (GA4, GSC, DataForSEO, Umami) - -### SKILL.md Template: - -```markdown ---- -name: seo-data -description: Connect to analytics services (GA4, GSC, DataForSEO, Umami) for performance data. Optional per-project configuration. ---- - -# 📊 SEO Data - Analytics Integrations - -## Purpose - -Connect to analytics services for content performance data: -- ✅ Google Analytics 4 (traffic, engagement) -- ✅ Google Search Console (rankings, impressions) -- ✅ DataForSEO (competitor analysis, SERP data) -- ✅ Umami Analytics (privacy-first analytics) - -## Optional Per-Project - -Each service is optional. Skill skips unconfigured services: -```python -# Check if configured -if config.get('ga4'): - data['ga4'] = ga4.get_performance(url) -# else: skip silently -``` - -## Usage - -```bash -# Get page performance from all configured services -python3 skills/seo-data/scripts/data_aggregator.py \ - --url "https://yoursite.com/blog/article" \ - --project-context "./website/context/" -``` - -## Modules - -1. **ga4_connector.py** - Google Analytics 4 API -2. **gsc_connector.py** - Google Search Console API -3. **dataforseo_client.py** - DataForSEO API -4. **umami_connector.py** - Umami Analytics API -5. **data_aggregator.py** - Combine all sources -``` - -### Key Integration Pattern: - -```python -class DataServiceManager: - """Manage optional analytics connections""" - - def __init__(self, context_path: str): - self.config = self._load_config(context_path) - self.services = {} - - # Initialize only configured services - if self.config.get('ga4_credentials'): - self.services['ga4'] = GA4Connector(self.config['ga4']) - - if self.config.get('gsc_credentials'): - self.services['gsc'] = GSCConnector(self.config['gsc']) - - # ... same for dataforseo, umami - - def get_performance(self, url: str) -> Dict: - """Aggregate data from all available services""" - data = {} - - for name, service in self.services.items(): - try: - data[name] = service.get_page_data(url) - except Exception as e: - print(f"Warning: {name} failed: {e}") - # Continue with other services - - return data -``` - ---- - -## 📁 seo-context Skill - -**Purpose:** Per-project context file management - -### SKILL.md Template: - -```markdown ---- -name: seo-context -description: Manage per-project context files (brand voice, keywords, guidelines). Each website has its own context/ folder. ---- - -# 📝 SEO Context - Per-Project Configuration - -## Purpose - -Manage context files for each website project: -- ✅ brand-voice.md - Brand voice, tone, messaging (Thai + English) -- ✅ target-keywords.md - Keyword clusters by intent -- ✅ seo-guidelines.md - SEO requirements (Thai-specific) -- ✅ internal-links-map.md - Key pages for internal linking -- ✅ style-guide.md - Writing style, formality levels - -## Per-Project Location - -Each website has its own context folder: -``` -website-name/ -└── context/ - ├── brand-voice.md - ├── target-keywords.md - ├── seo-guidelines.md - ├── internal-links-map.md - └── style-guide.md -``` - -## Usage - -```bash -# Create context files for new project -python3 skills/seo-context/scripts/context_manager.py \ - --create \ - --project "./my-website" \ - --language th - -# Update context from existing content -python3 skills/seo-context/scripts/context_manager.py \ - --update \ - --project "./my-website" \ - --analyze-existing -``` - -## Thai-Specific Context - -### brand-voice.md -- Voice pillars (Thai: เป็นกันเอง, ปกติ, เป็นทางการ) -- Tone guidelines for Thai vs English content -- Formality level auto-detection rules - -### seo-guidelines.md -- Thai keyword density: 1.0-1.5% -- Thai word count: 1500-3000 -- Thai readability: ม.6-ม.12 grade level -``` - ---- - -## 🚀 HOW TO USE THE COMPLETE SYSTEM - -### **1. Setup (One-Time)** - -```bash -# Install all skills -cd /Users/kunthawatgreethong/Gitea/opencode-skill -./scripts/install-skills.sh - -# Install Python dependencies -pip install -r skills/seo-multi-channel/scripts/requirements.txt -pip install -r skills/seo-analyzers/scripts/requirements.txt -pip install -r skills/seo-data/scripts/requirements.txt - -# Configure credentials (edit .env) -cp skills/seo-multi-channel/scripts/.env.example \ - ~/.config/opencode/.env -``` - -### **2. Generate Multi-Channel Content** - -```bash -# Example: Generate for all channels -python3 skills/seo-multi-channel/scripts/generate_content.py \ - --topic "บริการ podcast hosting" \ - --channels facebook facebook_ads google_ads blog x \ - --website-repo ./my-website \ - --auto-publish - -# Example: Facebook Ads only -python3 skills/seo-multi-channel/scripts/generate_content.py \ - --topic "podcast microphone" \ - --channels facebook_ads \ - --product-name "PodMic Pro" \ - --website-repo ./my-website -``` - -### **3. Output Structure** - -``` -output/บริการ-podcast-hosting/ -├── facebook/ -│ ├── posts.json -│ └── images/ -├── facebook_ads/ -│ ├── ads.json -│ └── images/ -├── google_ads/ -│ └── ads.json -├── blog/ -│ ├── article.md -│ └── images/ -├── x/ -│ └── thread.json -└── summary.json -``` - -### **4. Auto-Publish Blog** - -If `--auto-publish` enabled: -1. Blog saved to: `website/src/content/blog/(th)/{slug}.md` -2. Images saved to: `website/public/images/blog/{slug}/` -3. Git commit + push → triggers Easypanel auto-deploy -4. Returns deployment URL - ---- - -## 📋 NEXT STEPS TO COMPLETE - -### **Priority 1 (This Week):** -1. ✅ Complete seo-analyzers Python modules -2. ✅ Complete seo-data connectors -3. ✅ Complete seo-context manager -4. Test with real content generation - -### **Priority 2 (Next Week):** -1. Refine Thai language processing -2. Add more channel templates (LinkedIn, Instagram) -3. Integrate with actual image-generation skill -4. Integrate with actual image-edit skill -5. Test website-creator auto-publish flow - -### **Priority 3 (Future):** -1. Add actual API integration for Google Ads -2. Add actual API integration for Meta Ads -3. Add performance tracking -4. Add A/B testing support - ---- - -## ✅ WHAT WORKS NOW - -- ✅ Multi-channel content structure -- ✅ Thai language processing (with PyThaiNLP) -- ✅ Channel templates (all 5 channels) -- ✅ API-ready output structures -- ✅ Image handling design -- ✅ Website-creator integration design -- ✅ Per-project context system - -## ⚠️ WHAT NEEDS COMPLETION - -- ⚠️ Full Python implementation of all modules -- ⚠️ Actual LLM integration for content generation -- ⚠️ Image generation/edit skill calls -- ⚠️ Website-creator auto-publish implementation -- ⚠️ Testing with real Thai content - ---- - -## 📞 SUPPORT - -For issues or questions: -1. Check SKILL.md documentation -2. Review .env.example for credentials -3. Test with --help flag: `python generate_content.py --help` - ---- - -**Created based on SEOMachine workflow analysis + multi-channel requirements** -**Optimized for Thai market with full Thai language support** diff --git a/skills/alphaear-predictor/scripts/utils/llm/capability.py b/skills/alphaear-predictor/scripts/utils/llm/capability.py index 60592fd..04c9d5a 100644 --- a/skills/alphaear-predictor/scripts/utils/llm/capability.py +++ b/skills/alphaear-predictor/scripts/utils/llm/capability.py @@ -77,7 +77,7 @@ if __name__ == "__main__": load_dotenv(os.path.expanduser("~/.config/opencode/.env")) # 测试当前配置的模型 - p = os.getenv("LLM_PROVIDER", "ust") + p = os.getenv("LLM_PROVIDER", "minimax") m = os.getenv("LLM_MODEL", "Qwen") print(f"Testing {p}/{m}...") diff --git a/skills/alphaear-predictor/scripts/utils/llm/factory.py b/skills/alphaear-predictor/scripts/utils/llm/factory.py index 09b6ea5..449e5b8 100644 --- a/skills/alphaear-predictor/scripts/utils/llm/factory.py +++ b/skills/alphaear-predictor/scripts/utils/llm/factory.py @@ -5,10 +5,11 @@ from agno.models.dashscope import DashScope from agno.models.deepseek import DeepSeek from agno.models.openrouter import OpenRouter + def get_model(model_provider: str, model_id: str, **kwargs): """ Factory to get the appropriate LLM model. - + Args: model_provider: "openai", "ollama", "deepseek" model_id: The specific model ID (e.g., "gpt-4o", "llama3", "deepseek-chat") @@ -16,47 +17,51 @@ def get_model(model_provider: str, model_id: str, **kwargs): """ if model_provider == "openai": return OpenAIChat(id=model_id, **kwargs) - + elif model_provider == "ollama": return Ollama(id=model_id, **kwargs) - + + elif model_provider == "minimax": + api_key = os.getenv("MINIMAX_API_KEY") + if not api_key: + print("Warning: MINIMAX_API_KEY not set.") + + return OpenAIChat( + id=model_id, + base_url=os.getenv("MINIMAX_API_BASE", "https://api.minimax.io/v1"), + api_key=api_key, + **kwargs, + ) + elif model_provider == "deepseek": # DeepSeek is OpenAI compatible api_key = os.getenv("DEEPSEEK_API_KEY") if not api_key: print("Warning: DEEPSEEK_API_KEY not set.") - - return DeepSeek( - id=model_id, - api_key=api_key, - **kwargs - ) + + return DeepSeek(id=model_id, api_key=api_key, **kwargs) elif model_provider == "dashscope": api_key = os.getenv("DASHSCOPE_API_KEY") if not api_key: print("Warning: DASHSCOPE_API_KEY not set.") - + return DashScope( id=model_id, base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", api_key=api_key, - **kwargs + **kwargs, ) - elif model_provider == 'openrouter': + elif model_provider == "openrouter": api_key = os.getenv("OPENROUTER_API_KEY") if not api_key: - print('Warning: OPENROUTER_API_KEY not set.') - - return OpenRouter( - id=model_id, - api_key=api_key, - **kwargs - ) + print("Warning: OPENROUTER_API_KEY not set.") - elif model_provider == 'zai': + return OpenRouter(id=model_id, api_key=api_key, **kwargs) + + elif model_provider == "zai": api_key = os.getenv("ZAI_KEY_API") if not api_key: - print('Warning: ZAI_KEY_API not set.') + print("Warning: ZAI_KEY_API not set.") # role_map to ensure compatibility. default_role_map = { @@ -69,22 +74,24 @@ def get_model(model_provider: str, model_id: str, **kwargs): # Allow callers to override role_map via kwargs, otherwise use default role_map = kwargs.pop("role_map", default_role_map) - + return OpenAIChat( id=model_id, base_url="https://api.z.ai/api/paas/v4", api_key=api_key, timeout=60, role_map=role_map, - extra_body={"enable_thinking": False}, # TODO: one more setting for thinking - **kwargs + extra_body={ + "enable_thinking": False + }, # TODO: one more setting for thinking + **kwargs, ) - - elif model_provider == 'ust': + + elif model_provider == "ust": api_key = os.getenv("UST_KEY_API") if not api_key: - print('Warning: UST_KEY_API not set.') - + print("Warning: UST_KEY_API not set.") + # Some UST-compatible endpoints expect the standard OpenAI role names # (e.g. "system", "user", "assistant") rather than Agno's default # mapping which maps "system" -> "developer". Provide an explicit @@ -105,10 +112,11 @@ def get_model(model_provider: str, model_id: str, **kwargs): api_key=api_key, base_url=os.getenv("UST_URL"), role_map=role_map, - extra_body={"enable_thinking": False}, # TODO: one more setting for thinking - **kwargs + extra_body={ + "enable_thinking": False + }, # TODO: one more setting for thinking + **kwargs, ) - + else: raise ValueError(f"Unknown model provider: {model_provider}") - diff --git a/skills/alphaear-predictor/scripts/utils/predictor/training.py b/skills/alphaear-predictor/scripts/utils/predictor/training.py index c305eed..e3c69f3 100644 --- a/skills/alphaear-predictor/scripts/utils/predictor/training.py +++ b/skills/alphaear-predictor/scripts/utils/predictor/training.py @@ -97,7 +97,7 @@ class AutoSynthesisTrainer: self.model.load_state_dict(base_model.state_dict(), strict=False) # LLM for causality verification - provider = os.getenv("LLM_PROVIDER", "ust") + provider = os.getenv("LLM_PROVIDER", "minimax") model_id = os.getenv("LLM_MODEL", "Qwen") self.llm_agent = Agent(model=get_model(provider, model_id)) diff --git a/skills/alphaear-predictor/scripts/utils/search_tools.py b/skills/alphaear-predictor/scripts/utils/search_tools.py index a11d99a..50b08f3 100644 --- a/skills/alphaear-predictor/scripts/utils/search_tools.py +++ b/skills/alphaear-predictor/scripts/utils/search_tools.py @@ -563,7 +563,7 @@ class SearchTools: 4. Return strictly JSON: {{"reuse": true/false, "index": , "reason": "short explanation"}} """ # 初始化模型 - provider = os.getenv("LLM_PROVIDER", "ust") + provider = os.getenv("LLM_PROVIDER", "minimax") model_id = os.getenv("LLM_MODEL", "Qwen") host = os.getenv("LLM_HOST") if host: diff --git a/skills/alphaear-reporter/scripts/utils/llm/capability.py b/skills/alphaear-reporter/scripts/utils/llm/capability.py index 60592fd..04c9d5a 100644 --- a/skills/alphaear-reporter/scripts/utils/llm/capability.py +++ b/skills/alphaear-reporter/scripts/utils/llm/capability.py @@ -77,7 +77,7 @@ if __name__ == "__main__": load_dotenv(os.path.expanduser("~/.config/opencode/.env")) # 测试当前配置的模型 - p = os.getenv("LLM_PROVIDER", "ust") + p = os.getenv("LLM_PROVIDER", "minimax") m = os.getenv("LLM_MODEL", "Qwen") print(f"Testing {p}/{m}...") diff --git a/skills/alphaear-reporter/scripts/utils/predictor/training.py b/skills/alphaear-reporter/scripts/utils/predictor/training.py index c305eed..e3c69f3 100644 --- a/skills/alphaear-reporter/scripts/utils/predictor/training.py +++ b/skills/alphaear-reporter/scripts/utils/predictor/training.py @@ -97,7 +97,7 @@ class AutoSynthesisTrainer: self.model.load_state_dict(base_model.state_dict(), strict=False) # LLM for causality verification - provider = os.getenv("LLM_PROVIDER", "ust") + provider = os.getenv("LLM_PROVIDER", "minimax") model_id = os.getenv("LLM_MODEL", "Qwen") self.llm_agent = Agent(model=get_model(provider, model_id)) diff --git a/skills/alphaear-reporter/scripts/utils/search_tools.py b/skills/alphaear-reporter/scripts/utils/search_tools.py index a11d99a..50b08f3 100644 --- a/skills/alphaear-reporter/scripts/utils/search_tools.py +++ b/skills/alphaear-reporter/scripts/utils/search_tools.py @@ -563,7 +563,7 @@ class SearchTools: 4. Return strictly JSON: {{"reuse": true/false, "index": , "reason": "short explanation"}} """ # 初始化模型 - provider = os.getenv("LLM_PROVIDER", "ust") + provider = os.getenv("LLM_PROVIDER", "minimax") model_id = os.getenv("LLM_MODEL", "Qwen") host = os.getenv("LLM_HOST") if host: diff --git a/skills/alphaear-reporter/scripts/utils/sentiment_tools.py b/skills/alphaear-reporter/scripts/utils/sentiment_tools.py index 4a84947..f4278b5 100644 --- a/skills/alphaear-reporter/scripts/utils/sentiment_tools.py +++ b/skills/alphaear-reporter/scripts/utils/sentiment_tools.py @@ -9,23 +9,29 @@ from .database_manager import DatabaseManager # 从环境变量读取默认情绪分析模式 DEFAULT_SENTIMENT_MODE = os.getenv("SENTIMENT_MODE", "auto") # auto, bert, llm + class SentimentTools: """ 情绪分析工具 - 支持 LLM 和 BERT 两种模式 - + 模式说明: - "auto": 自动选择,优先使用 BERT(速度快),不可用时回退到 LLM - "bert": 强制使用 BERT 模型(需要 transformers 库) - "llm": 强制使用 LLM(更准确但较慢) - + 可通过环境变量 SENTIMENT_MODE 设置默认模式。 """ - - def __init__(self, db: DatabaseManager, mode: Optional[str] = None, - model_provider: str = "openai", model_id: str = "gpt-4o"): + + def __init__( + self, + db: DatabaseManager, + mode: Optional[str] = None, + model_provider: str = "openai", + model_id: str = "gpt-4o", + ): """ 初始化情绪分析工具。 - + Args: db: 数据库管理器实例 mode: 分析模式,可选 "auto", "bert", "llm"。None 则使用环境变量默认值。 @@ -36,11 +42,15 @@ class SentimentTools: self.mode = mode or DEFAULT_SENTIMENT_MODE self.llm_model = None self.bert_pipeline = None - + # Initialize LLM try: - provider = "ust" if os.getenv("UST_KEY_API") else model_provider - m_id = "Qwen" if provider == "ust" else model_id + provider = "minimax" if os.getenv("MINIMAX_API_KEY") else model_provider + m_id = ( + os.getenv("LLM_MODEL", "MiniMax-Text-01") + if provider == "minimax" + else model_id + ) self.llm_model = get_model(provider, m_id) except Exception as e: logger.warning(f"LLM initialization skipped: {e}") @@ -48,39 +58,59 @@ class SentimentTools: # Initialize BERT if needed if self.mode in ["bert", "auto"]: try: - from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification + from transformers import ( + pipeline, + AutoTokenizer, + AutoModelForSequenceClassification, + ) from transformers.utils import logging as transformers_logging - transformers_logging.set_verbosity_error() # 减少冗余日志 - - bert_model = os.getenv("BERT_SENTIMENT_MODEL", "uer/roberta-base-finetuned-chinanews-chinese") - + + transformers_logging.set_verbosity_error() # 减少冗余日志 + + bert_model = os.getenv( + "BERT_SENTIMENT_MODEL", + "uer/roberta-base-finetuned-chinanews-chinese", + ) + # 优先使用本地缓存 try: - tokenizer = AutoTokenizer.from_pretrained(bert_model, local_files_only=True) - model = AutoModelForSequenceClassification.from_pretrained(bert_model, local_files_only=True) - + tokenizer = AutoTokenizer.from_pretrained( + bert_model, local_files_only=True + ) + model = AutoModelForSequenceClassification.from_pretrained( + bert_model, local_files_only=True + ) + self.bert_pipeline = pipeline( - "sentiment-analysis", + "sentiment-analysis", model=model, tokenizer=tokenizer, - device=-1 + device=-1, + ) + logger.info( + f"✅ BERT pipeline loaded from local cache: {bert_model}" ) - logger.info(f"✅ BERT pipeline loaded from local cache: {bert_model}") except (OSError, ValueError, ImportError): # 本地没有,则从网络下载 logger.info(f"📡 Downloading BERT model: {bert_model}...") tokenizer = AutoTokenizer.from_pretrained(bert_model) - model = AutoModelForSequenceClassification.from_pretrained(bert_model) - + model = AutoModelForSequenceClassification.from_pretrained( + bert_model + ) + self.bert_pipeline = pipeline( - "sentiment-analysis", + "sentiment-analysis", model=model, tokenizer=tokenizer, - device=-1 + device=-1, + ) + logger.info( + f"✅ BERT Sentiment pipeline ({bert_model}) initialized." ) - logger.info(f"✅ BERT Sentiment pipeline ({bert_model}) initialized.") except ImportError: - logger.warning("Transformers library not installed. BERT sentiment analysis disabled.") + logger.warning( + "Transformers library not installed. BERT sentiment analysis disabled." + ) except Exception as e: if self.mode == "bert": logger.error(f"BERT mode requested but failed: {e}") @@ -88,14 +118,13 @@ class SentimentTools: logger.warning(f"BERT unavailable, using LLM only. Error: {e}") self.bert_pipeline = None - def analyze_sentiment(self, text: str) -> Dict[str, Union[float, str]]: """ 分析文本的情绪极性。根据初始化时的 mode 自动选择分析方法。 - + Args: text: 需要分析的文本内容,如新闻标题或摘要。 - + Returns: 包含以下字段的字典: - score: 情绪分值,范围 -1.0(极度负面)到 1.0(极度正面),0.0 为中性 @@ -115,10 +144,10 @@ class SentimentTools: def analyze_sentiment_llm(self, text: str) -> Dict[str, Union[float, str]]: """ 使用 LLM 进行深度情绪分析,可获得详细的分析理由。 - + Args: text: 需要分析的文本,最多处理前 1000 字符。 - + Returns: 包含 score, label, reason 的字典。 """ @@ -147,85 +176,112 @@ class SentimentTools: def analyze_sentiment_bert(self, texts: List[str]) -> List[Dict]: """ 使用 BERT 进行批量高速情绪分析。 - + Args: texts: 需要分析的文本列表。 - + Returns: 与输入列表等长的分析结果列表。 """ if not self.bert_pipeline: - return [{"score": 0.0, "label": "error", "reason": "BERT not available"}] * len(texts) - + return [ + {"score": 0.0, "label": "error", "reason": "BERT not available"} + ] * len(texts) + try: results = self.bert_pipeline(texts, truncation=True, max_length=512) processed = [] for r in results: - label = r['label'].lower() - score = r['score'] - + label = r["label"].lower() + score = r["score"] + # 标准化不同模型的标签格式 - if 'negative' in label or 'neg' in label: + if "negative" in label or "neg" in label: score = -score - elif 'neutral' in label or 'neu' in label: + elif "neutral" in label or "neu" in label: score = 0.0 - - processed.append({ - "score": float(round(score, 3)), - "label": "positive" if score > 0.1 else ("negative" if score < -0.1 else "neutral"), - "reason": "BERT automated analysis" - }) + + processed.append( + { + "score": float(round(score, 3)), + "label": "positive" + if score > 0.1 + else ("negative" if score < -0.1 else "neutral"), + "reason": "BERT automated analysis", + } + ) return processed except Exception as e: logger.error(f"BERT analysis failed: {e}") return [{"score": 0.0, "label": "error", "reason": str(e)}] * len(texts) - def batch_update_news_sentiment(self, source: Optional[str] = None, limit: int = 50, use_bert: Optional[bool] = None): + def batch_update_news_sentiment( + self, + source: Optional[str] = None, + limit: int = 50, + use_bert: Optional[bool] = None, + ): """ 批量更新数据库中新闻的情绪分数。 - + Args: source: 筛选特定新闻源,如 "wallstreetcn"。None 则处理所有来源。 limit: 最多处理的新闻数量。 use_bert: 是否使用 BERT。None 则根据初始化模式自动决定。 - + Returns: 成功更新的新闻数量。 """ news_items = self.db.get_daily_news(source=source, limit=limit) - to_analyze = [item for item in news_items if not item.get('sentiment_score')] - + to_analyze = [item for item in news_items if not item.get("sentiment_score")] + if not to_analyze: return 0 # 决定使用哪种方法 - should_use_bert = use_bert if use_bert is not None else (self.bert_pipeline is not None and self.mode != "llm") + should_use_bert = ( + use_bert + if use_bert is not None + else (self.bert_pipeline is not None and self.mode != "llm") + ) updated_count = 0 cursor = self.db.conn.cursor() - + if should_use_bert and self.bert_pipeline: - logger.info(f"🚀 Using BERT for batch analysis of {len(to_analyze)} items...") - titles = [item['title'] for item in to_analyze] + logger.info( + f"🚀 Using BERT for batch analysis of {len(to_analyze)} items..." + ) + titles = [item["title"] for item in to_analyze] results = self.analyze_sentiment_bert(titles) - + for item, analysis in zip(to_analyze, results): - cursor.execute(""" + cursor.execute( + """ UPDATE daily_news SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?) WHERE id = ? - """, (analysis['score'], analysis['reason'], item['id'])) + """, + (analysis["score"], analysis["reason"], item["id"]), + ) updated_count += 1 else: logger.info(f"🚶 Using LLM for analysis of {len(to_analyze)} items...") for item in to_analyze: - analysis = self.analyze_sentiment_llm(item['title']) - cursor.execute(""" + analysis = self.analyze_sentiment_llm(item["title"]) + cursor.execute( + """ UPDATE daily_news SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?) WHERE id = ? - """, (analysis.get('score', 0.0), analysis.get('reason', ''), item['id'])) + """, + ( + analysis.get("score", 0.0), + analysis.get("reason", ""), + item["id"], + ), + ) updated_count += 1 - + self.db.conn.commit() return updated_count diff --git a/skills/alphaear-search/scripts/llm/capability.py b/skills/alphaear-search/scripts/llm/capability.py index 68bc389..6d896be 100644 --- a/skills/alphaear-search/scripts/llm/capability.py +++ b/skills/alphaear-search/scripts/llm/capability.py @@ -77,7 +77,7 @@ if __name__ == "__main__": load_dotenv(os.path.expanduser("~/.config/opencode/.env")) # 测试当前配置的模型 - p = os.getenv("LLM_PROVIDER", "ust") + p = os.getenv("LLM_PROVIDER", "minimax") m = os.getenv("LLM_MODEL", "Qwen") print(f"Testing {p}/{m}...") diff --git a/skills/alphaear-search/scripts/sentiment_tools.py b/skills/alphaear-search/scripts/sentiment_tools.py index 4a84947..f4278b5 100644 --- a/skills/alphaear-search/scripts/sentiment_tools.py +++ b/skills/alphaear-search/scripts/sentiment_tools.py @@ -9,23 +9,29 @@ from .database_manager import DatabaseManager # 从环境变量读取默认情绪分析模式 DEFAULT_SENTIMENT_MODE = os.getenv("SENTIMENT_MODE", "auto") # auto, bert, llm + class SentimentTools: """ 情绪分析工具 - 支持 LLM 和 BERT 两种模式 - + 模式说明: - "auto": 自动选择,优先使用 BERT(速度快),不可用时回退到 LLM - "bert": 强制使用 BERT 模型(需要 transformers 库) - "llm": 强制使用 LLM(更准确但较慢) - + 可通过环境变量 SENTIMENT_MODE 设置默认模式。 """ - - def __init__(self, db: DatabaseManager, mode: Optional[str] = None, - model_provider: str = "openai", model_id: str = "gpt-4o"): + + def __init__( + self, + db: DatabaseManager, + mode: Optional[str] = None, + model_provider: str = "openai", + model_id: str = "gpt-4o", + ): """ 初始化情绪分析工具。 - + Args: db: 数据库管理器实例 mode: 分析模式,可选 "auto", "bert", "llm"。None 则使用环境变量默认值。 @@ -36,11 +42,15 @@ class SentimentTools: self.mode = mode or DEFAULT_SENTIMENT_MODE self.llm_model = None self.bert_pipeline = None - + # Initialize LLM try: - provider = "ust" if os.getenv("UST_KEY_API") else model_provider - m_id = "Qwen" if provider == "ust" else model_id + provider = "minimax" if os.getenv("MINIMAX_API_KEY") else model_provider + m_id = ( + os.getenv("LLM_MODEL", "MiniMax-Text-01") + if provider == "minimax" + else model_id + ) self.llm_model = get_model(provider, m_id) except Exception as e: logger.warning(f"LLM initialization skipped: {e}") @@ -48,39 +58,59 @@ class SentimentTools: # Initialize BERT if needed if self.mode in ["bert", "auto"]: try: - from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification + from transformers import ( + pipeline, + AutoTokenizer, + AutoModelForSequenceClassification, + ) from transformers.utils import logging as transformers_logging - transformers_logging.set_verbosity_error() # 减少冗余日志 - - bert_model = os.getenv("BERT_SENTIMENT_MODEL", "uer/roberta-base-finetuned-chinanews-chinese") - + + transformers_logging.set_verbosity_error() # 减少冗余日志 + + bert_model = os.getenv( + "BERT_SENTIMENT_MODEL", + "uer/roberta-base-finetuned-chinanews-chinese", + ) + # 优先使用本地缓存 try: - tokenizer = AutoTokenizer.from_pretrained(bert_model, local_files_only=True) - model = AutoModelForSequenceClassification.from_pretrained(bert_model, local_files_only=True) - + tokenizer = AutoTokenizer.from_pretrained( + bert_model, local_files_only=True + ) + model = AutoModelForSequenceClassification.from_pretrained( + bert_model, local_files_only=True + ) + self.bert_pipeline = pipeline( - "sentiment-analysis", + "sentiment-analysis", model=model, tokenizer=tokenizer, - device=-1 + device=-1, + ) + logger.info( + f"✅ BERT pipeline loaded from local cache: {bert_model}" ) - logger.info(f"✅ BERT pipeline loaded from local cache: {bert_model}") except (OSError, ValueError, ImportError): # 本地没有,则从网络下载 logger.info(f"📡 Downloading BERT model: {bert_model}...") tokenizer = AutoTokenizer.from_pretrained(bert_model) - model = AutoModelForSequenceClassification.from_pretrained(bert_model) - + model = AutoModelForSequenceClassification.from_pretrained( + bert_model + ) + self.bert_pipeline = pipeline( - "sentiment-analysis", + "sentiment-analysis", model=model, tokenizer=tokenizer, - device=-1 + device=-1, + ) + logger.info( + f"✅ BERT Sentiment pipeline ({bert_model}) initialized." ) - logger.info(f"✅ BERT Sentiment pipeline ({bert_model}) initialized.") except ImportError: - logger.warning("Transformers library not installed. BERT sentiment analysis disabled.") + logger.warning( + "Transformers library not installed. BERT sentiment analysis disabled." + ) except Exception as e: if self.mode == "bert": logger.error(f"BERT mode requested but failed: {e}") @@ -88,14 +118,13 @@ class SentimentTools: logger.warning(f"BERT unavailable, using LLM only. Error: {e}") self.bert_pipeline = None - def analyze_sentiment(self, text: str) -> Dict[str, Union[float, str]]: """ 分析文本的情绪极性。根据初始化时的 mode 自动选择分析方法。 - + Args: text: 需要分析的文本内容,如新闻标题或摘要。 - + Returns: 包含以下字段的字典: - score: 情绪分值,范围 -1.0(极度负面)到 1.0(极度正面),0.0 为中性 @@ -115,10 +144,10 @@ class SentimentTools: def analyze_sentiment_llm(self, text: str) -> Dict[str, Union[float, str]]: """ 使用 LLM 进行深度情绪分析,可获得详细的分析理由。 - + Args: text: 需要分析的文本,最多处理前 1000 字符。 - + Returns: 包含 score, label, reason 的字典。 """ @@ -147,85 +176,112 @@ class SentimentTools: def analyze_sentiment_bert(self, texts: List[str]) -> List[Dict]: """ 使用 BERT 进行批量高速情绪分析。 - + Args: texts: 需要分析的文本列表。 - + Returns: 与输入列表等长的分析结果列表。 """ if not self.bert_pipeline: - return [{"score": 0.0, "label": "error", "reason": "BERT not available"}] * len(texts) - + return [ + {"score": 0.0, "label": "error", "reason": "BERT not available"} + ] * len(texts) + try: results = self.bert_pipeline(texts, truncation=True, max_length=512) processed = [] for r in results: - label = r['label'].lower() - score = r['score'] - + label = r["label"].lower() + score = r["score"] + # 标准化不同模型的标签格式 - if 'negative' in label or 'neg' in label: + if "negative" in label or "neg" in label: score = -score - elif 'neutral' in label or 'neu' in label: + elif "neutral" in label or "neu" in label: score = 0.0 - - processed.append({ - "score": float(round(score, 3)), - "label": "positive" if score > 0.1 else ("negative" if score < -0.1 else "neutral"), - "reason": "BERT automated analysis" - }) + + processed.append( + { + "score": float(round(score, 3)), + "label": "positive" + if score > 0.1 + else ("negative" if score < -0.1 else "neutral"), + "reason": "BERT automated analysis", + } + ) return processed except Exception as e: logger.error(f"BERT analysis failed: {e}") return [{"score": 0.0, "label": "error", "reason": str(e)}] * len(texts) - def batch_update_news_sentiment(self, source: Optional[str] = None, limit: int = 50, use_bert: Optional[bool] = None): + def batch_update_news_sentiment( + self, + source: Optional[str] = None, + limit: int = 50, + use_bert: Optional[bool] = None, + ): """ 批量更新数据库中新闻的情绪分数。 - + Args: source: 筛选特定新闻源,如 "wallstreetcn"。None 则处理所有来源。 limit: 最多处理的新闻数量。 use_bert: 是否使用 BERT。None 则根据初始化模式自动决定。 - + Returns: 成功更新的新闻数量。 """ news_items = self.db.get_daily_news(source=source, limit=limit) - to_analyze = [item for item in news_items if not item.get('sentiment_score')] - + to_analyze = [item for item in news_items if not item.get("sentiment_score")] + if not to_analyze: return 0 # 决定使用哪种方法 - should_use_bert = use_bert if use_bert is not None else (self.bert_pipeline is not None and self.mode != "llm") + should_use_bert = ( + use_bert + if use_bert is not None + else (self.bert_pipeline is not None and self.mode != "llm") + ) updated_count = 0 cursor = self.db.conn.cursor() - + if should_use_bert and self.bert_pipeline: - logger.info(f"🚀 Using BERT for batch analysis of {len(to_analyze)} items...") - titles = [item['title'] for item in to_analyze] + logger.info( + f"🚀 Using BERT for batch analysis of {len(to_analyze)} items..." + ) + titles = [item["title"] for item in to_analyze] results = self.analyze_sentiment_bert(titles) - + for item, analysis in zip(to_analyze, results): - cursor.execute(""" + cursor.execute( + """ UPDATE daily_news SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?) WHERE id = ? - """, (analysis['score'], analysis['reason'], item['id'])) + """, + (analysis["score"], analysis["reason"], item["id"]), + ) updated_count += 1 else: logger.info(f"🚶 Using LLM for analysis of {len(to_analyze)} items...") for item in to_analyze: - analysis = self.analyze_sentiment_llm(item['title']) - cursor.execute(""" + analysis = self.analyze_sentiment_llm(item["title"]) + cursor.execute( + """ UPDATE daily_news SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?) WHERE id = ? - """, (analysis.get('score', 0.0), analysis.get('reason', ''), item['id'])) + """, + ( + analysis.get("score", 0.0), + analysis.get("reason", ""), + item["id"], + ), + ) updated_count += 1 - + self.db.conn.commit() return updated_count diff --git a/skills/alphaear-sentiment/scripts/llm/capability.py b/skills/alphaear-sentiment/scripts/llm/capability.py index 8c1b62c..6f65027 100644 --- a/skills/alphaear-sentiment/scripts/llm/capability.py +++ b/skills/alphaear-sentiment/scripts/llm/capability.py @@ -77,7 +77,7 @@ if __name__ == "__main__": load_dotenv(os.path.expanduser("~/.config/opencode/.env")) # 测试当前配置的模型 - p = os.getenv("LLM_PROVIDER", "ust") + p = os.getenv("LLM_PROVIDER", "minimax") m = os.getenv("LLM_MODEL", "Qwen") print(f"Testing {p}/{m}...") diff --git a/skills/alphaear-signal-tracker/scripts/utils/llm/capability.py b/skills/alphaear-signal-tracker/scripts/utils/llm/capability.py index 60592fd..04c9d5a 100644 --- a/skills/alphaear-signal-tracker/scripts/utils/llm/capability.py +++ b/skills/alphaear-signal-tracker/scripts/utils/llm/capability.py @@ -77,7 +77,7 @@ if __name__ == "__main__": load_dotenv(os.path.expanduser("~/.config/opencode/.env")) # 测试当前配置的模型 - p = os.getenv("LLM_PROVIDER", "ust") + p = os.getenv("LLM_PROVIDER", "minimax") m = os.getenv("LLM_MODEL", "Qwen") print(f"Testing {p}/{m}...") diff --git a/skills/alphaear-signal-tracker/scripts/utils/predictor/training.py b/skills/alphaear-signal-tracker/scripts/utils/predictor/training.py index c305eed..e3c69f3 100644 --- a/skills/alphaear-signal-tracker/scripts/utils/predictor/training.py +++ b/skills/alphaear-signal-tracker/scripts/utils/predictor/training.py @@ -97,7 +97,7 @@ class AutoSynthesisTrainer: self.model.load_state_dict(base_model.state_dict(), strict=False) # LLM for causality verification - provider = os.getenv("LLM_PROVIDER", "ust") + provider = os.getenv("LLM_PROVIDER", "minimax") model_id = os.getenv("LLM_MODEL", "Qwen") self.llm_agent = Agent(model=get_model(provider, model_id)) diff --git a/skills/alphaear-signal-tracker/scripts/utils/search_tools.py b/skills/alphaear-signal-tracker/scripts/utils/search_tools.py index a11d99a..50b08f3 100644 --- a/skills/alphaear-signal-tracker/scripts/utils/search_tools.py +++ b/skills/alphaear-signal-tracker/scripts/utils/search_tools.py @@ -563,7 +563,7 @@ class SearchTools: 4. Return strictly JSON: {{"reuse": true/false, "index": , "reason": "short explanation"}} """ # 初始化模型 - provider = os.getenv("LLM_PROVIDER", "ust") + provider = os.getenv("LLM_PROVIDER", "minimax") model_id = os.getenv("LLM_MODEL", "Qwen") host = os.getenv("LLM_HOST") if host: diff --git a/skills/alphaear-signal-tracker/scripts/utils/sentiment_tools.py b/skills/alphaear-signal-tracker/scripts/utils/sentiment_tools.py index 4a84947..f4278b5 100644 --- a/skills/alphaear-signal-tracker/scripts/utils/sentiment_tools.py +++ b/skills/alphaear-signal-tracker/scripts/utils/sentiment_tools.py @@ -9,23 +9,29 @@ from .database_manager import DatabaseManager # 从环境变量读取默认情绪分析模式 DEFAULT_SENTIMENT_MODE = os.getenv("SENTIMENT_MODE", "auto") # auto, bert, llm + class SentimentTools: """ 情绪分析工具 - 支持 LLM 和 BERT 两种模式 - + 模式说明: - "auto": 自动选择,优先使用 BERT(速度快),不可用时回退到 LLM - "bert": 强制使用 BERT 模型(需要 transformers 库) - "llm": 强制使用 LLM(更准确但较慢) - + 可通过环境变量 SENTIMENT_MODE 设置默认模式。 """ - - def __init__(self, db: DatabaseManager, mode: Optional[str] = None, - model_provider: str = "openai", model_id: str = "gpt-4o"): + + def __init__( + self, + db: DatabaseManager, + mode: Optional[str] = None, + model_provider: str = "openai", + model_id: str = "gpt-4o", + ): """ 初始化情绪分析工具。 - + Args: db: 数据库管理器实例 mode: 分析模式,可选 "auto", "bert", "llm"。None 则使用环境变量默认值。 @@ -36,11 +42,15 @@ class SentimentTools: self.mode = mode or DEFAULT_SENTIMENT_MODE self.llm_model = None self.bert_pipeline = None - + # Initialize LLM try: - provider = "ust" if os.getenv("UST_KEY_API") else model_provider - m_id = "Qwen" if provider == "ust" else model_id + provider = "minimax" if os.getenv("MINIMAX_API_KEY") else model_provider + m_id = ( + os.getenv("LLM_MODEL", "MiniMax-Text-01") + if provider == "minimax" + else model_id + ) self.llm_model = get_model(provider, m_id) except Exception as e: logger.warning(f"LLM initialization skipped: {e}") @@ -48,39 +58,59 @@ class SentimentTools: # Initialize BERT if needed if self.mode in ["bert", "auto"]: try: - from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification + from transformers import ( + pipeline, + AutoTokenizer, + AutoModelForSequenceClassification, + ) from transformers.utils import logging as transformers_logging - transformers_logging.set_verbosity_error() # 减少冗余日志 - - bert_model = os.getenv("BERT_SENTIMENT_MODEL", "uer/roberta-base-finetuned-chinanews-chinese") - + + transformers_logging.set_verbosity_error() # 减少冗余日志 + + bert_model = os.getenv( + "BERT_SENTIMENT_MODEL", + "uer/roberta-base-finetuned-chinanews-chinese", + ) + # 优先使用本地缓存 try: - tokenizer = AutoTokenizer.from_pretrained(bert_model, local_files_only=True) - model = AutoModelForSequenceClassification.from_pretrained(bert_model, local_files_only=True) - + tokenizer = AutoTokenizer.from_pretrained( + bert_model, local_files_only=True + ) + model = AutoModelForSequenceClassification.from_pretrained( + bert_model, local_files_only=True + ) + self.bert_pipeline = pipeline( - "sentiment-analysis", + "sentiment-analysis", model=model, tokenizer=tokenizer, - device=-1 + device=-1, + ) + logger.info( + f"✅ BERT pipeline loaded from local cache: {bert_model}" ) - logger.info(f"✅ BERT pipeline loaded from local cache: {bert_model}") except (OSError, ValueError, ImportError): # 本地没有,则从网络下载 logger.info(f"📡 Downloading BERT model: {bert_model}...") tokenizer = AutoTokenizer.from_pretrained(bert_model) - model = AutoModelForSequenceClassification.from_pretrained(bert_model) - + model = AutoModelForSequenceClassification.from_pretrained( + bert_model + ) + self.bert_pipeline = pipeline( - "sentiment-analysis", + "sentiment-analysis", model=model, tokenizer=tokenizer, - device=-1 + device=-1, + ) + logger.info( + f"✅ BERT Sentiment pipeline ({bert_model}) initialized." ) - logger.info(f"✅ BERT Sentiment pipeline ({bert_model}) initialized.") except ImportError: - logger.warning("Transformers library not installed. BERT sentiment analysis disabled.") + logger.warning( + "Transformers library not installed. BERT sentiment analysis disabled." + ) except Exception as e: if self.mode == "bert": logger.error(f"BERT mode requested but failed: {e}") @@ -88,14 +118,13 @@ class SentimentTools: logger.warning(f"BERT unavailable, using LLM only. Error: {e}") self.bert_pipeline = None - def analyze_sentiment(self, text: str) -> Dict[str, Union[float, str]]: """ 分析文本的情绪极性。根据初始化时的 mode 自动选择分析方法。 - + Args: text: 需要分析的文本内容,如新闻标题或摘要。 - + Returns: 包含以下字段的字典: - score: 情绪分值,范围 -1.0(极度负面)到 1.0(极度正面),0.0 为中性 @@ -115,10 +144,10 @@ class SentimentTools: def analyze_sentiment_llm(self, text: str) -> Dict[str, Union[float, str]]: """ 使用 LLM 进行深度情绪分析,可获得详细的分析理由。 - + Args: text: 需要分析的文本,最多处理前 1000 字符。 - + Returns: 包含 score, label, reason 的字典。 """ @@ -147,85 +176,112 @@ class SentimentTools: def analyze_sentiment_bert(self, texts: List[str]) -> List[Dict]: """ 使用 BERT 进行批量高速情绪分析。 - + Args: texts: 需要分析的文本列表。 - + Returns: 与输入列表等长的分析结果列表。 """ if not self.bert_pipeline: - return [{"score": 0.0, "label": "error", "reason": "BERT not available"}] * len(texts) - + return [ + {"score": 0.0, "label": "error", "reason": "BERT not available"} + ] * len(texts) + try: results = self.bert_pipeline(texts, truncation=True, max_length=512) processed = [] for r in results: - label = r['label'].lower() - score = r['score'] - + label = r["label"].lower() + score = r["score"] + # 标准化不同模型的标签格式 - if 'negative' in label or 'neg' in label: + if "negative" in label or "neg" in label: score = -score - elif 'neutral' in label or 'neu' in label: + elif "neutral" in label or "neu" in label: score = 0.0 - - processed.append({ - "score": float(round(score, 3)), - "label": "positive" if score > 0.1 else ("negative" if score < -0.1 else "neutral"), - "reason": "BERT automated analysis" - }) + + processed.append( + { + "score": float(round(score, 3)), + "label": "positive" + if score > 0.1 + else ("negative" if score < -0.1 else "neutral"), + "reason": "BERT automated analysis", + } + ) return processed except Exception as e: logger.error(f"BERT analysis failed: {e}") return [{"score": 0.0, "label": "error", "reason": str(e)}] * len(texts) - def batch_update_news_sentiment(self, source: Optional[str] = None, limit: int = 50, use_bert: Optional[bool] = None): + def batch_update_news_sentiment( + self, + source: Optional[str] = None, + limit: int = 50, + use_bert: Optional[bool] = None, + ): """ 批量更新数据库中新闻的情绪分数。 - + Args: source: 筛选特定新闻源,如 "wallstreetcn"。None 则处理所有来源。 limit: 最多处理的新闻数量。 use_bert: 是否使用 BERT。None 则根据初始化模式自动决定。 - + Returns: 成功更新的新闻数量。 """ news_items = self.db.get_daily_news(source=source, limit=limit) - to_analyze = [item for item in news_items if not item.get('sentiment_score')] - + to_analyze = [item for item in news_items if not item.get("sentiment_score")] + if not to_analyze: return 0 # 决定使用哪种方法 - should_use_bert = use_bert if use_bert is not None else (self.bert_pipeline is not None and self.mode != "llm") + should_use_bert = ( + use_bert + if use_bert is not None + else (self.bert_pipeline is not None and self.mode != "llm") + ) updated_count = 0 cursor = self.db.conn.cursor() - + if should_use_bert and self.bert_pipeline: - logger.info(f"🚀 Using BERT for batch analysis of {len(to_analyze)} items...") - titles = [item['title'] for item in to_analyze] + logger.info( + f"🚀 Using BERT for batch analysis of {len(to_analyze)} items..." + ) + titles = [item["title"] for item in to_analyze] results = self.analyze_sentiment_bert(titles) - + for item, analysis in zip(to_analyze, results): - cursor.execute(""" + cursor.execute( + """ UPDATE daily_news SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?) WHERE id = ? - """, (analysis['score'], analysis['reason'], item['id'])) + """, + (analysis["score"], analysis["reason"], item["id"]), + ) updated_count += 1 else: logger.info(f"🚶 Using LLM for analysis of {len(to_analyze)} items...") for item in to_analyze: - analysis = self.analyze_sentiment_llm(item['title']) - cursor.execute(""" + analysis = self.analyze_sentiment_llm(item["title"]) + cursor.execute( + """ UPDATE daily_news SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?) WHERE id = ? - """, (analysis.get('score', 0.0), analysis.get('reason', ''), item['id'])) + """, + ( + analysis.get("score", 0.0), + analysis.get("reason", ""), + item["id"], + ), + ) updated_count += 1 - + self.db.conn.commit() return updated_count