Update alphaear skills to use MiniMax as default LLM
Changes:
- Added 'minimax' provider to LLM factory
- Changed default LLM_PROVIDER from 'ust' to 'minimax'
- Changed default LLM_MODEL from 'Qwen' to 'MiniMax-Text-01'
- Updated REASONING_MODEL_PROVIDER and TOOL_MODEL_PROVIDER to minimax
- Sentiment tools now prefer MINIMAX_API_KEY over UST_KEY_API
- .env.example updated with MiniMax defaults
This commit is contained in:
14
.env.example
14
.env.example
@@ -106,19 +106,19 @@ SHODH_USER_ID=default
|
|||||||
# JINA API - For content extraction
|
# JINA API - For content extraction
|
||||||
JINA_API_KEY=
|
JINA_API_KEY=
|
||||||
|
|
||||||
# LLM Configuration (AlphaEar uses multiple providers)
|
# LLM Configuration (AlphaEar uses MiniMax by default - OpenAI compatible)
|
||||||
LLM_PROVIDER=ust
|
LLM_PROVIDER=minimax
|
||||||
LLM_MODEL=Qwen
|
LLM_MODEL=MiniMax-Text-01
|
||||||
LLM_HOST=
|
LLM_HOST=
|
||||||
|
|
||||||
# Reasoning Model (for alphaear-predictor)
|
# Reasoning Model (for alphaear-predictor)
|
||||||
REASONING_MODEL_PROVIDER=openai
|
REASONING_MODEL_PROVIDER=minimax
|
||||||
REASONING_MODEL_ID=gpt-4o
|
REASONING_MODEL_ID=MiniMax-Text-01
|
||||||
REASONING_MODEL_HOST=
|
REASONING_MODEL_HOST=
|
||||||
|
|
||||||
# Tool Model (for alphaear-predictor)
|
# Tool Model (for alphaear-predictor)
|
||||||
TOOL_MODEL_PROVIDER=openai
|
TOOL_MODEL_PROVIDER=minimax
|
||||||
TOOL_MODEL_ID=gpt-4o
|
TOOL_MODEL_ID=MiniMax-Text-01
|
||||||
TOOL_MODEL_HOST=
|
TOOL_MODEL_HOST=
|
||||||
|
|
||||||
# Embedding Model (for Kronos predictor)
|
# Embedding Model (for Kronos predictor)
|
||||||
|
|||||||
@@ -67,17 +67,20 @@ find_opencode_folders() {
|
|||||||
|
|
||||||
#######################################
# Copy every skill from $SKILLS_DIR into a target folder and report how
# many skill directories the target contains afterwards.
# Globals:   SKILLS_DIR (read) - source directory holding one folder per skill
#            SUCCESS, NC (read) - colour escape sequences for the status line
# Arguments: $1 - destination directory (created when missing)
# Outputs:   "[OK] Installed N skills to DIR" on stdout; errors on stderr
# Returns:   0 on success, 1 when an argument is missing or a copy step fails
#######################################
install_all_to_folder() {
    local target_dir="${1:-}"
    local skill_dir skill_name installed_dir
    local count=0

    # Refuse empty paths: with --delete, an empty target would make rsync
    # mirror into "/", and an empty SKILLS_DIR would mirror "/" itself.
    if [ -z "$target_dir" ] || [ -z "${SKILLS_DIR:-}" ]; then
        echo "install_all_to_folder: target directory and SKILLS_DIR must be set" >&2
        return 1
    fi

    mkdir -p "$target_dir" || return 1

    if command -v rsync > /dev/null 2>&1; then
        # Exact mirror: anything in the target that is not in SKILLS_DIR is removed.
        rsync -a --delete "$SKILLS_DIR/" "$target_dir/" || return 1
    else
        # Fallback without rsync: replace each folder that carries a SKILL.md.
        for skill_dir in "$SKILLS_DIR"/*/; do
            [ -d "$skill_dir" ] || continue
            skill_name=$(basename "$skill_dir")
            if [ -f "$skill_dir/SKILL.md" ]; then
                if [ -d "${target_dir}/${skill_name}" ]; then
                    rm -rf -- "${target_dir:?}/${skill_name}"
                fi
                cp -r "$skill_dir" "${target_dir}/${skill_name}" || return 1
            fi
        done
    fi

    # Count sub-directories with a glob instead of parsing `ls` output; an
    # unmatched glob stays literal and is filtered out by the -d test.
    for installed_dir in "$target_dir"/*/; do
        [ -d "$installed_dir" ] || continue
        count=$((count + 1))
    done

    echo -e "${SUCCESS}[OK]${NC} Installed $count skills to ${target_dir}"
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,434 +0,0 @@
|
|||||||
# 🎯 SEO Multi-Channel Skill Set - Complete Implementation
|
|
||||||
|
|
||||||
**Status:** Core implementation complete
|
|
||||||
**Created:** 2026-03-08
|
|
||||||
**Based on:** SEOMachine workflow + Multi-channel requirements
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ WHAT'S BEEN CREATED
|
|
||||||
|
|
||||||
### **1. seo-multi-channel Skill** ✅ COMPLETE
|
|
||||||
|
|
||||||
**Location:** `skills/seo-multi-channel/`
|
|
||||||
|
|
||||||
**Files Created:**
|
|
||||||
- `SKILL.md` - Complete documentation (828 lines)
|
|
||||||
- `scripts/generate_content.py` - Main generator with Thai support
|
|
||||||
- `scripts/templates/facebook.yaml` - Facebook organic posts
|
|
||||||
- `scripts/templates/facebook_ads.yaml` - Facebook Ads (API-ready)
|
|
||||||
- `scripts/templates/google_ads.yaml` - Google Ads (API-ready)
|
|
||||||
- `scripts/templates/blog.yaml` - SEO blog posts
|
|
||||||
- `scripts/templates/x_thread.yaml` - Twitter/X threads
|
|
||||||
- `scripts/requirements.txt` - Python dependencies
|
|
||||||
- `scripts/.env.example` - Credentials template
|
|
||||||
|
|
||||||
**Features Implemented:**
|
|
||||||
- ✅ Thai language processing with PyThaiNLP
|
|
||||||
- ✅ 5 channels: Facebook > Facebook Ads > Google Ads > Blog > X
|
|
||||||
- ✅ Image handling (generation for non-product, edit for product)
|
|
||||||
- ✅ API-ready output structures (Meta Graph API, Google Ads API)
|
|
||||||
- ✅ Website-creator integration design
|
|
||||||
- ✅ Auto-publish to Astro content collections
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### **2. Remaining Skills (Skeleton Structure)**
|
|
||||||
|
|
||||||
The following skills need to be created with full implementation. Below are the SKILL.md templates and key Python modules.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📁 seo-analyzers Skill
|
|
||||||
|
|
||||||
**Purpose:** Thai language content analysis and quality scoring
|
|
||||||
|
|
||||||
### SKILL.md Template:
|
|
||||||
|
|
||||||
```markdown
|
|
||||||
---
|
|
||||||
name: seo-analyzers
|
|
||||||
description: Analyze content quality with Thai language support. Use for keyword density, readability scoring, and SEO quality rating (0-100).
|
|
||||||
---
|
|
||||||
|
|
||||||
# 🔍 SEO Analyzers - Thai Language Content Analysis
|
|
||||||
|
|
||||||
## Purpose
|
|
||||||
|
|
||||||
Analyze content quality with full Thai language support:
|
|
||||||
- ✅ Thai keyword density (PyThaiNLP-based)
|
|
||||||
- ✅ Thai readability scoring
|
|
||||||
- ✅ Content quality rating (0-100)
|
|
||||||
- ✅ AI pattern detection (content scrubbing)
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Analyze keyword density
|
|
||||||
python3 skills/seo-analyzers/scripts/thai_keyword_analyzer.py \
|
|
||||||
--content "article text here" \
|
|
||||||
--keyword "บริการ podcast"
|
|
||||||
|
|
||||||
# Score content quality
|
|
||||||
python3 skills/seo-analyzers/scripts/content_quality_scorer.py \
|
|
||||||
--file article.md \
|
|
||||||
--language th
|
|
||||||
```
|
|
||||||
|
|
||||||
## Modules
|
|
||||||
|
|
||||||
1. **thai_keyword_analyzer.py** - Thai keyword density, distribution, clustering
|
|
||||||
2. **thai_readability.py** - Thai readability scoring (grade level, formality)
|
|
||||||
3. **content_quality_scorer.py** - Overall 0-100 quality score
|
|
||||||
4. **content_scrubber_thai.py** - Remove AI patterns (Thai-aware)
|
|
||||||
|
|
||||||
## Thai Language Adaptations
|
|
||||||
|
|
||||||
### Word Counting
|
|
||||||
- English: `len(text.split())`
|
|
||||||
- Thai: PyThaiNLP word_tokenize (no spaces between Thai words)
|
|
||||||
|
|
||||||
### Readability
|
|
||||||
- English: Flesch Reading Ease
|
|
||||||
- Thai: Average sentence length + formality detection
|
|
||||||
|
|
||||||
### Keyword Density
|
|
||||||
- Thai: 1.0-1.5% (lower due to compound words)
|
|
||||||
- English: 1.5-2.0%
|
|
||||||
```
|
|
||||||
|
|
||||||
### Key Python Module: thai_keyword_analyzer.py
|
|
||||||
|
|
||||||
```python
|
|
||||||
#!/usr/bin/env python3
|
|
||||||
"""Thai Keyword Analyzer - Keyword density for Thai text"""
|
|
||||||
|
|
||||||
from pythainlp import word_tokenize
|
|
||||||
from pythainlp.util import normalize
|
|
||||||
from typing import Dict, List
|
|
||||||
|
|
||||||
class ThaiKeywordAnalyzer:
    """Analyze keyword density in Thai text.

    Uses ``word_tokenize`` and ``normalize`` imported from PyThaiNLP at the
    top of this module. Density is reported as a percentage:
    keyword occurrences / token count * 100.
    """

    def count_words(self, text: str) -> int:
        """Return the number of non-blank tokens from the ``newmm`` tokenizer."""
        tokens = word_tokenize(text, engine="newmm")
        return len([t for t in tokens if t.strip()])

    def _count_occurrences(self, text: str, keyword: str) -> int:
        """Count keyword occurrences after normalizing both strings.

        Returns 0 for an empty text or keyword; ``str.count("")`` would
        otherwise report ``len(text) + 1`` matches.
        """
        if not text or not keyword:
            return 0
        keyword_norm = normalize(keyword)
        if not keyword_norm:
            return 0
        return normalize(text).count(keyword_norm)

    def calculate_density(self, text: str, keyword: str) -> float:
        """Return keyword density as a percentage (0.0 when nothing matches)."""
        occurrences = self._count_occurrences(text, keyword)
        if occurrences == 0:
            # No match: skip tokenization, the result is zero regardless.
            return 0.0
        word_count = self.count_words(text)
        return (occurrences / word_count * 100) if word_count > 0 else 0.0

    def analyze(self, text: str, keyword: str) -> Dict:
        """Run the full keyword analysis.

        Returns a dict with ``word_count``, ``keyword``, ``occurrences``,
        ``density`` (rounded to 2 decimals), ``status`` and
        ``recommendations``. ``occurrences`` and ``density`` come from the
        same normalized count, so they always agree.
        """
        occurrences = self._count_occurrences(text, keyword)
        word_count = self.count_words(text)  # tokenize once, reuse below
        density = (occurrences / word_count * 100) if word_count > 0 else 0.0

        return {
            'word_count': word_count,
            'keyword': keyword,
            'occurrences': occurrences,
            'density': round(density, 2),
            'status': self._get_density_status(density),
            'recommendations': self._get_recommendations(density)
        }

    def _get_density_status(self, density: float) -> str:
        """Map a density percentage to a status label (optimal is 1.0-1.5)."""
        if density < 0.5:
            return "too_low"
        elif density < 1.0:
            return "slightly_low"
        elif density <= 1.5:
            return "optimal"
        elif density <= 2.0:
            return "slightly_high"
        else:
            return "too_high"

    def _get_recommendations(self, density: float) -> List[str]:
        """Return Thai-language advice for densities below 1.0 or above 2.0."""
        recs = []
        if density < 1.0:
            recs.append("เพิ่มการใช้คำหลักในเนื้อหา (target: 1.0-1.5%)")
        elif density > 2.0:
            recs.append("ลดการใช้คำหลักลง อาจถูกมองว่า keyword stuffing")
        return recs
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📁 seo-data Skill
|
|
||||||
|
|
||||||
**Purpose:** Analytics integrations (GA4, GSC, DataForSEO, Umami)
|
|
||||||
|
|
||||||
### SKILL.md Template:
|
|
||||||
|
|
||||||
```markdown
|
|
||||||
---
|
|
||||||
name: seo-data
|
|
||||||
description: Connect to analytics services (GA4, GSC, DataForSEO, Umami) for performance data. Optional per-project configuration.
|
|
||||||
---
|
|
||||||
|
|
||||||
# 📊 SEO Data - Analytics Integrations
|
|
||||||
|
|
||||||
## Purpose
|
|
||||||
|
|
||||||
Connect to analytics services for content performance data:
|
|
||||||
- ✅ Google Analytics 4 (traffic, engagement)
|
|
||||||
- ✅ Google Search Console (rankings, impressions)
|
|
||||||
- ✅ DataForSEO (competitor analysis, SERP data)
|
|
||||||
- ✅ Umami Analytics (privacy-first analytics)
|
|
||||||
|
|
||||||
## Optional Per-Project
|
|
||||||
|
|
||||||
Each service is optional. Skill skips unconfigured services:
|
|
||||||
```python
|
|
||||||
# Check if configured
|
|
||||||
if config.get('ga4'):
|
|
||||||
data['ga4'] = ga4.get_performance(url)
|
|
||||||
# else: skip silently
|
|
||||||
```
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Get page performance from all configured services
|
|
||||||
python3 skills/seo-data/scripts/data_aggregator.py \
|
|
||||||
--url "https://yoursite.com/blog/article" \
|
|
||||||
--project-context "./website/context/"
|
|
||||||
```
|
|
||||||
|
|
||||||
## Modules
|
|
||||||
|
|
||||||
1. **ga4_connector.py** - Google Analytics 4 API
|
|
||||||
2. **gsc_connector.py** - Google Search Console API
|
|
||||||
3. **dataforseo_client.py** - DataForSEO API
|
|
||||||
4. **umami_connector.py** - Umami Analytics API
|
|
||||||
5. **data_aggregator.py** - Combine all sources
|
|
||||||
```
|
|
||||||
|
|
||||||
### Key Integration Pattern:
|
|
||||||
|
|
||||||
```python
|
|
||||||
class DataServiceManager:
    """Manage optional analytics connections.

    Only services present in the loaded project config are initialized;
    everything else is skipped.
    """

    def __init__(self, context_path: str):
        """Load the project config and build a connector per configured service."""
        self.config = self._load_config(context_path)
        self.services = {}

        # Initialize only configured services. A service is enabled when its
        # "<name>_credentials" entry is set AND its "<name>" settings section
        # exists; checking one key and indexing the other raised KeyError on
        # a partially filled config instead of skipping the service.
        if self.config.get('ga4_credentials') and 'ga4' in self.config:
            self.services['ga4'] = GA4Connector(self.config['ga4'])

        if self.config.get('gsc_credentials') and 'gsc' in self.config:
            self.services['gsc'] = GSCConnector(self.config['gsc'])

        # ... same for dataforseo, umami

    def get_performance(self, url: str) -> Dict:
        """Aggregate page data for ``url`` from all available services.

        A failing service is reported with a printed warning and left out of
        the result so the remaining services are still queried.
        """
        data = {}

        for name, service in self.services.items():
            try:
                data[name] = service.get_page_data(url)
            except Exception as e:
                print(f"Warning: {name} failed: {e}")
                # Continue with other services

        return data
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📁 seo-context Skill
|
|
||||||
|
|
||||||
**Purpose:** Per-project context file management
|
|
||||||
|
|
||||||
### SKILL.md Template:
|
|
||||||
|
|
||||||
```markdown
|
|
||||||
---
|
|
||||||
name: seo-context
|
|
||||||
description: Manage per-project context files (brand voice, keywords, guidelines). Each website has its own context/ folder.
|
|
||||||
---
|
|
||||||
|
|
||||||
# 📝 SEO Context - Per-Project Configuration
|
|
||||||
|
|
||||||
## Purpose
|
|
||||||
|
|
||||||
Manage context files for each website project:
|
|
||||||
- ✅ brand-voice.md - Brand voice, tone, messaging (Thai + English)
|
|
||||||
- ✅ target-keywords.md - Keyword clusters by intent
|
|
||||||
- ✅ seo-guidelines.md - SEO requirements (Thai-specific)
|
|
||||||
- ✅ internal-links-map.md - Key pages for internal linking
|
|
||||||
- ✅ style-guide.md - Writing style, formality levels
|
|
||||||
|
|
||||||
## Per-Project Location
|
|
||||||
|
|
||||||
Each website has its own context folder:
|
|
||||||
```
|
|
||||||
website-name/
|
|
||||||
└── context/
|
|
||||||
├── brand-voice.md
|
|
||||||
├── target-keywords.md
|
|
||||||
├── seo-guidelines.md
|
|
||||||
├── internal-links-map.md
|
|
||||||
└── style-guide.md
|
|
||||||
```
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Create context files for new project
|
|
||||||
python3 skills/seo-context/scripts/context_manager.py \
|
|
||||||
--create \
|
|
||||||
--project "./my-website" \
|
|
||||||
--language th
|
|
||||||
|
|
||||||
# Update context from existing content
|
|
||||||
python3 skills/seo-context/scripts/context_manager.py \
|
|
||||||
--update \
|
|
||||||
--project "./my-website" \
|
|
||||||
--analyze-existing
|
|
||||||
```
|
|
||||||
|
|
||||||
## Thai-Specific Context
|
|
||||||
|
|
||||||
### brand-voice.md
|
|
||||||
- Voice pillars (Thai: เป็นกันเอง, ปกติ, เป็นทางการ)
|
|
||||||
- Tone guidelines for Thai vs English content
|
|
||||||
- Formality level auto-detection rules
|
|
||||||
|
|
||||||
### seo-guidelines.md
|
|
||||||
- Thai keyword density: 1.0-1.5%
|
|
||||||
- Thai word count: 1500-3000
|
|
||||||
- Thai readability: ม.6-ม.12 grade level
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚀 HOW TO USE THE COMPLETE SYSTEM
|
|
||||||
|
|
||||||
### **1. Setup (One-Time)**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Install all skills
|
|
||||||
cd /path/to/opencode-skill  # your local clone of this repository
|
|
||||||
./scripts/install-skills.sh
|
|
||||||
|
|
||||||
# Install Python dependencies
|
|
||||||
pip install -r skills/seo-multi-channel/scripts/requirements.txt
|
|
||||||
pip install -r skills/seo-analyzers/scripts/requirements.txt
|
|
||||||
pip install -r skills/seo-data/scripts/requirements.txt
|
|
||||||
|
|
||||||
# Configure credentials (edit .env)
|
|
||||||
cp skills/seo-multi-channel/scripts/.env.example \
|
|
||||||
~/.config/opencode/.env
|
|
||||||
```
|
|
||||||
|
|
||||||
### **2. Generate Multi-Channel Content**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Example: Generate for all channels
|
|
||||||
python3 skills/seo-multi-channel/scripts/generate_content.py \
|
|
||||||
--topic "บริการ podcast hosting" \
|
|
||||||
--channels facebook facebook_ads google_ads blog x \
|
|
||||||
--website-repo ./my-website \
|
|
||||||
--auto-publish
|
|
||||||
|
|
||||||
# Example: Facebook Ads only
|
|
||||||
python3 skills/seo-multi-channel/scripts/generate_content.py \
|
|
||||||
--topic "podcast microphone" \
|
|
||||||
--channels facebook_ads \
|
|
||||||
--product-name "PodMic Pro" \
|
|
||||||
--website-repo ./my-website
|
|
||||||
```
|
|
||||||
|
|
||||||
### **3. Output Structure**
|
|
||||||
|
|
||||||
```
|
|
||||||
output/บริการ-podcast-hosting/
|
|
||||||
├── facebook/
|
|
||||||
│ ├── posts.json
|
|
||||||
│ └── images/
|
|
||||||
├── facebook_ads/
|
|
||||||
│ ├── ads.json
|
|
||||||
│ └── images/
|
|
||||||
├── google_ads/
|
|
||||||
│ └── ads.json
|
|
||||||
├── blog/
|
|
||||||
│ ├── article.md
|
|
||||||
│ └── images/
|
|
||||||
├── x/
|
|
||||||
│ └── thread.json
|
|
||||||
└── summary.json
|
|
||||||
```
|
|
||||||
|
|
||||||
### **4. Auto-Publish Blog**
|
|
||||||
|
|
||||||
If `--auto-publish` enabled:
|
|
||||||
1. Blog saved to: `website/src/content/blog/(th)/{slug}.md`
|
|
||||||
2. Images saved to: `website/public/images/blog/{slug}/`
|
|
||||||
3. Git commit + push → triggers Easypanel auto-deploy
|
|
||||||
4. Returns deployment URL
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📋 NEXT STEPS TO COMPLETE
|
|
||||||
|
|
||||||
### **Priority 1 (This Week):**
|
|
||||||
1. ✅ Complete seo-analyzers Python modules
|
|
||||||
2. ✅ Complete seo-data connectors
|
|
||||||
3. ✅ Complete seo-context manager
|
|
||||||
4. Test with real content generation
|
|
||||||
|
|
||||||
### **Priority 2 (Next Week):**
|
|
||||||
1. Refine Thai language processing
|
|
||||||
2. Add more channel templates (LinkedIn, Instagram)
|
|
||||||
3. Integrate with actual image-generation skill
|
|
||||||
4. Integrate with actual image-edit skill
|
|
||||||
5. Test website-creator auto-publish flow
|
|
||||||
|
|
||||||
### **Priority 3 (Future):**
|
|
||||||
1. Add actual API integration for Google Ads
|
|
||||||
2. Add actual API integration for Meta Ads
|
|
||||||
3. Add performance tracking
|
|
||||||
4. Add A/B testing support
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ✅ WHAT WORKS NOW
|
|
||||||
|
|
||||||
- ✅ Multi-channel content structure
|
|
||||||
- ✅ Thai language processing (with PyThaiNLP)
|
|
||||||
- ✅ Channel templates (all 5 channels)
|
|
||||||
- ✅ API-ready output structures
|
|
||||||
- ✅ Image handling design
|
|
||||||
- ✅ Website-creator integration design
|
|
||||||
- ✅ Per-project context system
|
|
||||||
|
|
||||||
## ⚠️ WHAT NEEDS COMPLETION
|
|
||||||
|
|
||||||
- ⚠️ Full Python implementation of all modules
|
|
||||||
- ⚠️ Actual LLM integration for content generation
|
|
||||||
- ⚠️ Image generation/edit skill calls
|
|
||||||
- ⚠️ Website-creator auto-publish implementation
|
|
||||||
- ⚠️ Testing with real Thai content
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📞 SUPPORT
|
|
||||||
|
|
||||||
For issues or questions:
|
|
||||||
1. Check SKILL.md documentation
|
|
||||||
2. Review .env.example for credentials
|
|
||||||
3. Test with --help flag: `python3 skills/seo-multi-channel/scripts/generate_content.py --help`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Created based on SEOMachine workflow analysis + multi-channel requirements**
|
|
||||||
**Optimized for Thai market with full Thai language support**
|
|
||||||
@@ -77,7 +77,7 @@ if __name__ == "__main__":
|
|||||||
load_dotenv(os.path.expanduser("~/.config/opencode/.env"))
|
load_dotenv(os.path.expanduser("~/.config/opencode/.env"))
|
||||||
|
|
||||||
# 测试当前配置的模型
|
# 测试当前配置的模型
|
||||||
p = os.getenv("LLM_PROVIDER", "ust")
|
p = os.getenv("LLM_PROVIDER", "minimax")
|
||||||
m = os.getenv("LLM_MODEL", "Qwen")
|
# Fallback model must be one the default "minimax" provider serves.
m = os.getenv("LLM_MODEL", "MiniMax-Text-01")
|
||||||
|
|
||||||
print(f"Testing {p}/{m}...")
|
print(f"Testing {p}/{m}...")
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from agno.models.dashscope import DashScope
|
|||||||
from agno.models.deepseek import DeepSeek
|
from agno.models.deepseek import DeepSeek
|
||||||
from agno.models.openrouter import OpenRouter
|
from agno.models.openrouter import OpenRouter
|
||||||
|
|
||||||
|
|
||||||
def get_model(model_provider: str, model_id: str, **kwargs):
|
def get_model(model_provider: str, model_id: str, **kwargs):
|
||||||
"""
|
"""
|
||||||
Factory to get the appropriate LLM model.
|
Factory to get the appropriate LLM model.
|
||||||
@@ -20,17 +21,25 @@ def get_model(model_provider: str, model_id: str, **kwargs):
|
|||||||
elif model_provider == "ollama":
|
elif model_provider == "ollama":
|
||||||
return Ollama(id=model_id, **kwargs)
|
return Ollama(id=model_id, **kwargs)
|
||||||
|
|
||||||
|
elif model_provider == "minimax":
|
||||||
|
api_key = os.getenv("MINIMAX_API_KEY")
|
||||||
|
if not api_key:
|
||||||
|
print("Warning: MINIMAX_API_KEY not set.")
|
||||||
|
|
||||||
|
return OpenAIChat(
|
||||||
|
id=model_id,
|
||||||
|
base_url=os.getenv("MINIMAX_API_BASE", "https://api.minimax.io/v1"),
|
||||||
|
api_key=api_key,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
elif model_provider == "deepseek":
|
elif model_provider == "deepseek":
|
||||||
# DeepSeek is OpenAI compatible
|
# DeepSeek is OpenAI compatible
|
||||||
api_key = os.getenv("DEEPSEEK_API_KEY")
|
api_key = os.getenv("DEEPSEEK_API_KEY")
|
||||||
if not api_key:
|
if not api_key:
|
||||||
print("Warning: DEEPSEEK_API_KEY not set.")
|
print("Warning: DEEPSEEK_API_KEY not set.")
|
||||||
|
|
||||||
return DeepSeek(
|
return DeepSeek(id=model_id, api_key=api_key, **kwargs)
|
||||||
id=model_id,
|
|
||||||
api_key=api_key,
|
|
||||||
**kwargs
|
|
||||||
)
|
|
||||||
elif model_provider == "dashscope":
|
elif model_provider == "dashscope":
|
||||||
api_key = os.getenv("DASHSCOPE_API_KEY")
|
api_key = os.getenv("DASHSCOPE_API_KEY")
|
||||||
if not api_key:
|
if not api_key:
|
||||||
@@ -40,23 +49,19 @@ def get_model(model_provider: str, model_id: str, **kwargs):
|
|||||||
id=model_id,
|
id=model_id,
|
||||||
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
|
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
|
||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
**kwargs
|
**kwargs,
|
||||||
)
|
)
|
||||||
elif model_provider == 'openrouter':
|
elif model_provider == "openrouter":
|
||||||
api_key = os.getenv("OPENROUTER_API_KEY")
|
api_key = os.getenv("OPENROUTER_API_KEY")
|
||||||
if not api_key:
|
if not api_key:
|
||||||
print('Warning: OPENROUTER_API_KEY not set.')
|
print("Warning: OPENROUTER_API_KEY not set.")
|
||||||
|
|
||||||
return OpenRouter(
|
return OpenRouter(id=model_id, api_key=api_key, **kwargs)
|
||||||
id=model_id,
|
|
||||||
api_key=api_key,
|
|
||||||
**kwargs
|
|
||||||
)
|
|
||||||
|
|
||||||
elif model_provider == 'zai':
|
elif model_provider == "zai":
|
||||||
api_key = os.getenv("ZAI_KEY_API")
|
api_key = os.getenv("ZAI_KEY_API")
|
||||||
if not api_key:
|
if not api_key:
|
||||||
print('Warning: ZAI_KEY_API not set.')
|
print("Warning: ZAI_KEY_API not set.")
|
||||||
|
|
||||||
# role_map to ensure compatibility.
|
# role_map to ensure compatibility.
|
||||||
default_role_map = {
|
default_role_map = {
|
||||||
@@ -76,14 +81,16 @@ def get_model(model_provider: str, model_id: str, **kwargs):
|
|||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
timeout=60,
|
timeout=60,
|
||||||
role_map=role_map,
|
role_map=role_map,
|
||||||
extra_body={"enable_thinking": False}, # TODO: one more setting for thinking
|
extra_body={
|
||||||
**kwargs
|
"enable_thinking": False
|
||||||
|
}, # TODO: one more setting for thinking
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif model_provider == 'ust':
|
elif model_provider == "ust":
|
||||||
api_key = os.getenv("UST_KEY_API")
|
api_key = os.getenv("UST_KEY_API")
|
||||||
if not api_key:
|
if not api_key:
|
||||||
print('Warning: UST_KEY_API not set.')
|
print("Warning: UST_KEY_API not set.")
|
||||||
|
|
||||||
# Some UST-compatible endpoints expect the standard OpenAI role names
|
# Some UST-compatible endpoints expect the standard OpenAI role names
|
||||||
# (e.g. "system", "user", "assistant") rather than Agno's default
|
# (e.g. "system", "user", "assistant") rather than Agno's default
|
||||||
@@ -105,10 +112,11 @@ def get_model(model_provider: str, model_id: str, **kwargs):
|
|||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
base_url=os.getenv("UST_URL"),
|
base_url=os.getenv("UST_URL"),
|
||||||
role_map=role_map,
|
role_map=role_map,
|
||||||
extra_body={"enable_thinking": False}, # TODO: one more setting for thinking
|
extra_body={
|
||||||
**kwargs
|
"enable_thinking": False
|
||||||
|
}, # TODO: one more setting for thinking
|
||||||
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown model provider: {model_provider}")
|
raise ValueError(f"Unknown model provider: {model_provider}")
|
||||||
|
|
||||||
|
|||||||
@@ -97,7 +97,7 @@ class AutoSynthesisTrainer:
|
|||||||
self.model.load_state_dict(base_model.state_dict(), strict=False)
|
self.model.load_state_dict(base_model.state_dict(), strict=False)
|
||||||
|
|
||||||
# LLM for causality verification
|
# LLM for causality verification
|
||||||
provider = os.getenv("LLM_PROVIDER", "ust")
|
provider = os.getenv("LLM_PROVIDER", "minimax")
|
||||||
model_id = os.getenv("LLM_MODEL", "Qwen")
|
# Fallback model must be one the default "minimax" provider serves.
model_id = os.getenv("LLM_MODEL", "MiniMax-Text-01")
|
||||||
self.llm_agent = Agent(model=get_model(provider, model_id))
|
self.llm_agent = Agent(model=get_model(provider, model_id))
|
||||||
|
|
||||||
|
|||||||
@@ -563,7 +563,7 @@ class SearchTools:
|
|||||||
4. Return strictly JSON: {{"reuse": true/false, "index": <candidate_index_int>, "reason": "short explanation"}}
|
4. Return strictly JSON: {{"reuse": true/false, "index": <candidate_index_int>, "reason": "short explanation"}}
|
||||||
"""
|
"""
|
||||||
# 初始化模型
|
# 初始化模型
|
||||||
provider = os.getenv("LLM_PROVIDER", "ust")
|
provider = os.getenv("LLM_PROVIDER", "minimax")
|
||||||
model_id = os.getenv("LLM_MODEL", "Qwen")
|
# Fallback model must be one the default "minimax" provider serves.
model_id = os.getenv("LLM_MODEL", "MiniMax-Text-01")
|
||||||
host = os.getenv("LLM_HOST")
|
host = os.getenv("LLM_HOST")
|
||||||
if host:
|
if host:
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ if __name__ == "__main__":
|
|||||||
load_dotenv(os.path.expanduser("~/.config/opencode/.env"))
|
load_dotenv(os.path.expanduser("~/.config/opencode/.env"))
|
||||||
|
|
||||||
# 测试当前配置的模型
|
# 测试当前配置的模型
|
||||||
p = os.getenv("LLM_PROVIDER", "ust")
|
p = os.getenv("LLM_PROVIDER", "minimax")
|
||||||
m = os.getenv("LLM_MODEL", "Qwen")
|
# Fallback model must be one the default "minimax" provider serves.
m = os.getenv("LLM_MODEL", "MiniMax-Text-01")
|
||||||
|
|
||||||
print(f"Testing {p}/{m}...")
|
print(f"Testing {p}/{m}...")
|
||||||
|
|||||||
@@ -97,7 +97,7 @@ class AutoSynthesisTrainer:
|
|||||||
self.model.load_state_dict(base_model.state_dict(), strict=False)
|
self.model.load_state_dict(base_model.state_dict(), strict=False)
|
||||||
|
|
||||||
# LLM for causality verification
|
# LLM for causality verification
|
||||||
provider = os.getenv("LLM_PROVIDER", "ust")
|
provider = os.getenv("LLM_PROVIDER", "minimax")
|
||||||
model_id = os.getenv("LLM_MODEL", "Qwen")
|
# Fallback model must be one the default "minimax" provider serves.
model_id = os.getenv("LLM_MODEL", "MiniMax-Text-01")
|
||||||
self.llm_agent = Agent(model=get_model(provider, model_id))
|
self.llm_agent = Agent(model=get_model(provider, model_id))
|
||||||
|
|
||||||
|
|||||||
@@ -563,7 +563,7 @@ class SearchTools:
|
|||||||
4. Return strictly JSON: {{"reuse": true/false, "index": <candidate_index_int>, "reason": "short explanation"}}
|
4. Return strictly JSON: {{"reuse": true/false, "index": <candidate_index_int>, "reason": "short explanation"}}
|
||||||
"""
|
"""
|
||||||
# 初始化模型
|
# 初始化模型
|
||||||
provider = os.getenv("LLM_PROVIDER", "ust")
|
provider = os.getenv("LLM_PROVIDER", "minimax")
|
||||||
model_id = os.getenv("LLM_MODEL", "Qwen")
|
# Fallback model must be one the default "minimax" provider serves.
model_id = os.getenv("LLM_MODEL", "MiniMax-Text-01")
|
||||||
host = os.getenv("LLM_HOST")
|
host = os.getenv("LLM_HOST")
|
||||||
if host:
|
if host:
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from .database_manager import DatabaseManager
|
|||||||
# 从环境变量读取默认情绪分析模式
|
# 从环境变量读取默认情绪分析模式
|
||||||
DEFAULT_SENTIMENT_MODE = os.getenv("SENTIMENT_MODE", "auto") # auto, bert, llm
|
DEFAULT_SENTIMENT_MODE = os.getenv("SENTIMENT_MODE", "auto") # auto, bert, llm
|
||||||
|
|
||||||
|
|
||||||
class SentimentTools:
|
class SentimentTools:
|
||||||
"""
|
"""
|
||||||
情绪分析工具 - 支持 LLM 和 BERT 两种模式
|
情绪分析工具 - 支持 LLM 和 BERT 两种模式
|
||||||
@@ -21,8 +22,13 @@ class SentimentTools:
|
|||||||
可通过环境变量 SENTIMENT_MODE 设置默认模式。
|
可通过环境变量 SENTIMENT_MODE 设置默认模式。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, db: DatabaseManager, mode: Optional[str] = None,
|
def __init__(
|
||||||
model_provider: str = "openai", model_id: str = "gpt-4o"):
|
self,
|
||||||
|
db: DatabaseManager,
|
||||||
|
mode: Optional[str] = None,
|
||||||
|
model_provider: str = "openai",
|
||||||
|
model_id: str = "gpt-4o",
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
初始化情绪分析工具。
|
初始化情绪分析工具。
|
||||||
|
|
||||||
@@ -39,8 +45,12 @@ class SentimentTools:
|
|||||||
|
|
||||||
# Initialize LLM
|
# Initialize LLM
|
||||||
try:
|
try:
|
||||||
provider = "ust" if os.getenv("UST_KEY_API") else model_provider
|
provider = "minimax" if os.getenv("MINIMAX_API_KEY") else model_provider
|
||||||
m_id = "Qwen" if provider == "ust" else model_id
|
m_id = (
|
||||||
|
os.getenv("LLM_MODEL", "MiniMax-Text-01")
|
||||||
|
if provider == "minimax"
|
||||||
|
else model_id
|
||||||
|
)
|
||||||
self.llm_model = get_model(provider, m_id)
|
self.llm_model = get_model(provider, m_id)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"LLM initialization skipped: {e}")
|
logger.warning(f"LLM initialization skipped: {e}")
|
||||||
@@ -48,39 +58,59 @@ class SentimentTools:
|
|||||||
# Initialize BERT if needed
|
# Initialize BERT if needed
|
||||||
if self.mode in ["bert", "auto"]:
|
if self.mode in ["bert", "auto"]:
|
||||||
try:
|
try:
|
||||||
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
|
from transformers import (
|
||||||
|
pipeline,
|
||||||
|
AutoTokenizer,
|
||||||
|
AutoModelForSequenceClassification,
|
||||||
|
)
|
||||||
from transformers.utils import logging as transformers_logging
|
from transformers.utils import logging as transformers_logging
|
||||||
|
|
||||||
transformers_logging.set_verbosity_error() # 减少冗余日志
|
transformers_logging.set_verbosity_error() # 减少冗余日志
|
||||||
|
|
||||||
bert_model = os.getenv("BERT_SENTIMENT_MODEL", "uer/roberta-base-finetuned-chinanews-chinese")
|
bert_model = os.getenv(
|
||||||
|
"BERT_SENTIMENT_MODEL",
|
||||||
|
"uer/roberta-base-finetuned-chinanews-chinese",
|
||||||
|
)
|
||||||
|
|
||||||
# 优先使用本地缓存
|
# 优先使用本地缓存
|
||||||
try:
|
try:
|
||||||
tokenizer = AutoTokenizer.from_pretrained(bert_model, local_files_only=True)
|
tokenizer = AutoTokenizer.from_pretrained(
|
||||||
model = AutoModelForSequenceClassification.from_pretrained(bert_model, local_files_only=True)
|
bert_model, local_files_only=True
|
||||||
|
)
|
||||||
|
model = AutoModelForSequenceClassification.from_pretrained(
|
||||||
|
bert_model, local_files_only=True
|
||||||
|
)
|
||||||
|
|
||||||
self.bert_pipeline = pipeline(
|
self.bert_pipeline = pipeline(
|
||||||
"sentiment-analysis",
|
"sentiment-analysis",
|
||||||
model=model,
|
model=model,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
device=-1
|
device=-1,
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"✅ BERT pipeline loaded from local cache: {bert_model}"
|
||||||
)
|
)
|
||||||
logger.info(f"✅ BERT pipeline loaded from local cache: {bert_model}")
|
|
||||||
except (OSError, ValueError, ImportError):
|
except (OSError, ValueError, ImportError):
|
||||||
# 本地没有,则从网络下载
|
# 本地没有,则从网络下载
|
||||||
logger.info(f"📡 Downloading BERT model: {bert_model}...")
|
logger.info(f"📡 Downloading BERT model: {bert_model}...")
|
||||||
tokenizer = AutoTokenizer.from_pretrained(bert_model)
|
tokenizer = AutoTokenizer.from_pretrained(bert_model)
|
||||||
model = AutoModelForSequenceClassification.from_pretrained(bert_model)
|
model = AutoModelForSequenceClassification.from_pretrained(
|
||||||
|
bert_model
|
||||||
|
)
|
||||||
|
|
||||||
self.bert_pipeline = pipeline(
|
self.bert_pipeline = pipeline(
|
||||||
"sentiment-analysis",
|
"sentiment-analysis",
|
||||||
model=model,
|
model=model,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
device=-1
|
device=-1,
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"✅ BERT Sentiment pipeline ({bert_model}) initialized."
|
||||||
)
|
)
|
||||||
logger.info(f"✅ BERT Sentiment pipeline ({bert_model}) initialized.")
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
logger.warning("Transformers library not installed. BERT sentiment analysis disabled.")
|
logger.warning(
|
||||||
|
"Transformers library not installed. BERT sentiment analysis disabled."
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.mode == "bert":
|
if self.mode == "bert":
|
||||||
logger.error(f"BERT mode requested but failed: {e}")
|
logger.error(f"BERT mode requested but failed: {e}")
|
||||||
@@ -88,7 +118,6 @@ class SentimentTools:
|
|||||||
logger.warning(f"BERT unavailable, using LLM only. Error: {e}")
|
logger.warning(f"BERT unavailable, using LLM only. Error: {e}")
|
||||||
self.bert_pipeline = None
|
self.bert_pipeline = None
|
||||||
|
|
||||||
|
|
||||||
def analyze_sentiment(self, text: str) -> Dict[str, Union[float, str]]:
|
def analyze_sentiment(self, text: str) -> Dict[str, Union[float, str]]:
|
||||||
"""
|
"""
|
||||||
分析文本的情绪极性。根据初始化时的 mode 自动选择分析方法。
|
分析文本的情绪极性。根据初始化时的 mode 自动选择分析方法。
|
||||||
@@ -155,32 +184,43 @@ class SentimentTools:
|
|||||||
与输入列表等长的分析结果列表。
|
与输入列表等长的分析结果列表。
|
||||||
"""
|
"""
|
||||||
if not self.bert_pipeline:
|
if not self.bert_pipeline:
|
||||||
return [{"score": 0.0, "label": "error", "reason": "BERT not available"}] * len(texts)
|
return [
|
||||||
|
{"score": 0.0, "label": "error", "reason": "BERT not available"}
|
||||||
|
] * len(texts)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
results = self.bert_pipeline(texts, truncation=True, max_length=512)
|
results = self.bert_pipeline(texts, truncation=True, max_length=512)
|
||||||
processed = []
|
processed = []
|
||||||
for r in results:
|
for r in results:
|
||||||
label = r['label'].lower()
|
label = r["label"].lower()
|
||||||
score = r['score']
|
score = r["score"]
|
||||||
|
|
||||||
# 标准化不同模型的标签格式
|
# 标准化不同模型的标签格式
|
||||||
if 'negative' in label or 'neg' in label:
|
if "negative" in label or "neg" in label:
|
||||||
score = -score
|
score = -score
|
||||||
elif 'neutral' in label or 'neu' in label:
|
elif "neutral" in label or "neu" in label:
|
||||||
score = 0.0
|
score = 0.0
|
||||||
|
|
||||||
processed.append({
|
processed.append(
|
||||||
|
{
|
||||||
"score": float(round(score, 3)),
|
"score": float(round(score, 3)),
|
||||||
"label": "positive" if score > 0.1 else ("negative" if score < -0.1 else "neutral"),
|
"label": "positive"
|
||||||
"reason": "BERT automated analysis"
|
if score > 0.1
|
||||||
})
|
else ("negative" if score < -0.1 else "neutral"),
|
||||||
|
"reason": "BERT automated analysis",
|
||||||
|
}
|
||||||
|
)
|
||||||
return processed
|
return processed
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"BERT analysis failed: {e}")
|
logger.error(f"BERT analysis failed: {e}")
|
||||||
return [{"score": 0.0, "label": "error", "reason": str(e)}] * len(texts)
|
return [{"score": 0.0, "label": "error", "reason": str(e)}] * len(texts)
|
||||||
|
|
||||||
def batch_update_news_sentiment(self, source: Optional[str] = None, limit: int = 50, use_bert: Optional[bool] = None):
|
def batch_update_news_sentiment(
|
||||||
|
self,
|
||||||
|
source: Optional[str] = None,
|
||||||
|
limit: int = 50,
|
||||||
|
use_bert: Optional[bool] = None,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
批量更新数据库中新闻的情绪分数。
|
批量更新数据库中新闻的情绪分数。
|
||||||
|
|
||||||
@@ -193,38 +233,54 @@ class SentimentTools:
|
|||||||
成功更新的新闻数量。
|
成功更新的新闻数量。
|
||||||
"""
|
"""
|
||||||
news_items = self.db.get_daily_news(source=source, limit=limit)
|
news_items = self.db.get_daily_news(source=source, limit=limit)
|
||||||
to_analyze = [item for item in news_items if not item.get('sentiment_score')]
|
to_analyze = [item for item in news_items if not item.get("sentiment_score")]
|
||||||
|
|
||||||
if not to_analyze:
|
if not to_analyze:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
# 决定使用哪种方法
|
# 决定使用哪种方法
|
||||||
should_use_bert = use_bert if use_bert is not None else (self.bert_pipeline is not None and self.mode != "llm")
|
should_use_bert = (
|
||||||
|
use_bert
|
||||||
|
if use_bert is not None
|
||||||
|
else (self.bert_pipeline is not None and self.mode != "llm")
|
||||||
|
)
|
||||||
|
|
||||||
updated_count = 0
|
updated_count = 0
|
||||||
cursor = self.db.conn.cursor()
|
cursor = self.db.conn.cursor()
|
||||||
|
|
||||||
if should_use_bert and self.bert_pipeline:
|
if should_use_bert and self.bert_pipeline:
|
||||||
logger.info(f"🚀 Using BERT for batch analysis of {len(to_analyze)} items...")
|
logger.info(
|
||||||
titles = [item['title'] for item in to_analyze]
|
f"🚀 Using BERT for batch analysis of {len(to_analyze)} items..."
|
||||||
|
)
|
||||||
|
titles = [item["title"] for item in to_analyze]
|
||||||
results = self.analyze_sentiment_bert(titles)
|
results = self.analyze_sentiment_bert(titles)
|
||||||
|
|
||||||
for item, analysis in zip(to_analyze, results):
|
for item, analysis in zip(to_analyze, results):
|
||||||
cursor.execute("""
|
cursor.execute(
|
||||||
|
"""
|
||||||
UPDATE daily_news
|
UPDATE daily_news
|
||||||
SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?)
|
SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?)
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
""", (analysis['score'], analysis['reason'], item['id']))
|
""",
|
||||||
|
(analysis["score"], analysis["reason"], item["id"]),
|
||||||
|
)
|
||||||
updated_count += 1
|
updated_count += 1
|
||||||
else:
|
else:
|
||||||
logger.info(f"🚶 Using LLM for analysis of {len(to_analyze)} items...")
|
logger.info(f"🚶 Using LLM for analysis of {len(to_analyze)} items...")
|
||||||
for item in to_analyze:
|
for item in to_analyze:
|
||||||
analysis = self.analyze_sentiment_llm(item['title'])
|
analysis = self.analyze_sentiment_llm(item["title"])
|
||||||
cursor.execute("""
|
cursor.execute(
|
||||||
|
"""
|
||||||
UPDATE daily_news
|
UPDATE daily_news
|
||||||
SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?)
|
SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?)
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
""", (analysis.get('score', 0.0), analysis.get('reason', ''), item['id']))
|
""",
|
||||||
|
(
|
||||||
|
analysis.get("score", 0.0),
|
||||||
|
analysis.get("reason", ""),
|
||||||
|
item["id"],
|
||||||
|
),
|
||||||
|
)
|
||||||
updated_count += 1
|
updated_count += 1
|
||||||
|
|
||||||
self.db.conn.commit()
|
self.db.conn.commit()
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ if __name__ == "__main__":
|
|||||||
load_dotenv(os.path.expanduser("~/.config/opencode/.env"))
|
load_dotenv(os.path.expanduser("~/.config/opencode/.env"))
|
||||||
|
|
||||||
# 测试当前配置的模型
|
# 测试当前配置的模型
|
||||||
p = os.getenv("LLM_PROVIDER", "ust")
|
p = os.getenv("LLM_PROVIDER", "minimax")
|
||||||
m = os.getenv("LLM_MODEL", "Qwen")
|
m = os.getenv("LLM_MODEL", "Qwen")
|
||||||
|
|
||||||
print(f"Testing {p}/{m}...")
|
print(f"Testing {p}/{m}...")
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from .database_manager import DatabaseManager
|
|||||||
# 从环境变量读取默认情绪分析模式
|
# 从环境变量读取默认情绪分析模式
|
||||||
DEFAULT_SENTIMENT_MODE = os.getenv("SENTIMENT_MODE", "auto") # auto, bert, llm
|
DEFAULT_SENTIMENT_MODE = os.getenv("SENTIMENT_MODE", "auto") # auto, bert, llm
|
||||||
|
|
||||||
|
|
||||||
class SentimentTools:
|
class SentimentTools:
|
||||||
"""
|
"""
|
||||||
情绪分析工具 - 支持 LLM 和 BERT 两种模式
|
情绪分析工具 - 支持 LLM 和 BERT 两种模式
|
||||||
@@ -21,8 +22,13 @@ class SentimentTools:
|
|||||||
可通过环境变量 SENTIMENT_MODE 设置默认模式。
|
可通过环境变量 SENTIMENT_MODE 设置默认模式。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, db: DatabaseManager, mode: Optional[str] = None,
|
def __init__(
|
||||||
model_provider: str = "openai", model_id: str = "gpt-4o"):
|
self,
|
||||||
|
db: DatabaseManager,
|
||||||
|
mode: Optional[str] = None,
|
||||||
|
model_provider: str = "openai",
|
||||||
|
model_id: str = "gpt-4o",
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
初始化情绪分析工具。
|
初始化情绪分析工具。
|
||||||
|
|
||||||
@@ -39,8 +45,12 @@ class SentimentTools:
|
|||||||
|
|
||||||
# Initialize LLM
|
# Initialize LLM
|
||||||
try:
|
try:
|
||||||
provider = "ust" if os.getenv("UST_KEY_API") else model_provider
|
provider = "minimax" if os.getenv("MINIMAX_API_KEY") else model_provider
|
||||||
m_id = "Qwen" if provider == "ust" else model_id
|
m_id = (
|
||||||
|
os.getenv("LLM_MODEL", "MiniMax-Text-01")
|
||||||
|
if provider == "minimax"
|
||||||
|
else model_id
|
||||||
|
)
|
||||||
self.llm_model = get_model(provider, m_id)
|
self.llm_model = get_model(provider, m_id)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"LLM initialization skipped: {e}")
|
logger.warning(f"LLM initialization skipped: {e}")
|
||||||
@@ -48,39 +58,59 @@ class SentimentTools:
|
|||||||
# Initialize BERT if needed
|
# Initialize BERT if needed
|
||||||
if self.mode in ["bert", "auto"]:
|
if self.mode in ["bert", "auto"]:
|
||||||
try:
|
try:
|
||||||
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
|
from transformers import (
|
||||||
|
pipeline,
|
||||||
|
AutoTokenizer,
|
||||||
|
AutoModelForSequenceClassification,
|
||||||
|
)
|
||||||
from transformers.utils import logging as transformers_logging
|
from transformers.utils import logging as transformers_logging
|
||||||
|
|
||||||
transformers_logging.set_verbosity_error() # 减少冗余日志
|
transformers_logging.set_verbosity_error() # 减少冗余日志
|
||||||
|
|
||||||
bert_model = os.getenv("BERT_SENTIMENT_MODEL", "uer/roberta-base-finetuned-chinanews-chinese")
|
bert_model = os.getenv(
|
||||||
|
"BERT_SENTIMENT_MODEL",
|
||||||
|
"uer/roberta-base-finetuned-chinanews-chinese",
|
||||||
|
)
|
||||||
|
|
||||||
# 优先使用本地缓存
|
# 优先使用本地缓存
|
||||||
try:
|
try:
|
||||||
tokenizer = AutoTokenizer.from_pretrained(bert_model, local_files_only=True)
|
tokenizer = AutoTokenizer.from_pretrained(
|
||||||
model = AutoModelForSequenceClassification.from_pretrained(bert_model, local_files_only=True)
|
bert_model, local_files_only=True
|
||||||
|
)
|
||||||
|
model = AutoModelForSequenceClassification.from_pretrained(
|
||||||
|
bert_model, local_files_only=True
|
||||||
|
)
|
||||||
|
|
||||||
self.bert_pipeline = pipeline(
|
self.bert_pipeline = pipeline(
|
||||||
"sentiment-analysis",
|
"sentiment-analysis",
|
||||||
model=model,
|
model=model,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
device=-1
|
device=-1,
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"✅ BERT pipeline loaded from local cache: {bert_model}"
|
||||||
)
|
)
|
||||||
logger.info(f"✅ BERT pipeline loaded from local cache: {bert_model}")
|
|
||||||
except (OSError, ValueError, ImportError):
|
except (OSError, ValueError, ImportError):
|
||||||
# 本地没有,则从网络下载
|
# 本地没有,则从网络下载
|
||||||
logger.info(f"📡 Downloading BERT model: {bert_model}...")
|
logger.info(f"📡 Downloading BERT model: {bert_model}...")
|
||||||
tokenizer = AutoTokenizer.from_pretrained(bert_model)
|
tokenizer = AutoTokenizer.from_pretrained(bert_model)
|
||||||
model = AutoModelForSequenceClassification.from_pretrained(bert_model)
|
model = AutoModelForSequenceClassification.from_pretrained(
|
||||||
|
bert_model
|
||||||
|
)
|
||||||
|
|
||||||
self.bert_pipeline = pipeline(
|
self.bert_pipeline = pipeline(
|
||||||
"sentiment-analysis",
|
"sentiment-analysis",
|
||||||
model=model,
|
model=model,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
device=-1
|
device=-1,
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"✅ BERT Sentiment pipeline ({bert_model}) initialized."
|
||||||
)
|
)
|
||||||
logger.info(f"✅ BERT Sentiment pipeline ({bert_model}) initialized.")
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
logger.warning("Transformers library not installed. BERT sentiment analysis disabled.")
|
logger.warning(
|
||||||
|
"Transformers library not installed. BERT sentiment analysis disabled."
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.mode == "bert":
|
if self.mode == "bert":
|
||||||
logger.error(f"BERT mode requested but failed: {e}")
|
logger.error(f"BERT mode requested but failed: {e}")
|
||||||
@@ -88,7 +118,6 @@ class SentimentTools:
|
|||||||
logger.warning(f"BERT unavailable, using LLM only. Error: {e}")
|
logger.warning(f"BERT unavailable, using LLM only. Error: {e}")
|
||||||
self.bert_pipeline = None
|
self.bert_pipeline = None
|
||||||
|
|
||||||
|
|
||||||
def analyze_sentiment(self, text: str) -> Dict[str, Union[float, str]]:
|
def analyze_sentiment(self, text: str) -> Dict[str, Union[float, str]]:
|
||||||
"""
|
"""
|
||||||
分析文本的情绪极性。根据初始化时的 mode 自动选择分析方法。
|
分析文本的情绪极性。根据初始化时的 mode 自动选择分析方法。
|
||||||
@@ -155,32 +184,43 @@ class SentimentTools:
|
|||||||
与输入列表等长的分析结果列表。
|
与输入列表等长的分析结果列表。
|
||||||
"""
|
"""
|
||||||
if not self.bert_pipeline:
|
if not self.bert_pipeline:
|
||||||
return [{"score": 0.0, "label": "error", "reason": "BERT not available"}] * len(texts)
|
return [
|
||||||
|
{"score": 0.0, "label": "error", "reason": "BERT not available"}
|
||||||
|
] * len(texts)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
results = self.bert_pipeline(texts, truncation=True, max_length=512)
|
results = self.bert_pipeline(texts, truncation=True, max_length=512)
|
||||||
processed = []
|
processed = []
|
||||||
for r in results:
|
for r in results:
|
||||||
label = r['label'].lower()
|
label = r["label"].lower()
|
||||||
score = r['score']
|
score = r["score"]
|
||||||
|
|
||||||
# 标准化不同模型的标签格式
|
# 标准化不同模型的标签格式
|
||||||
if 'negative' in label or 'neg' in label:
|
if "negative" in label or "neg" in label:
|
||||||
score = -score
|
score = -score
|
||||||
elif 'neutral' in label or 'neu' in label:
|
elif "neutral" in label or "neu" in label:
|
||||||
score = 0.0
|
score = 0.0
|
||||||
|
|
||||||
processed.append({
|
processed.append(
|
||||||
|
{
|
||||||
"score": float(round(score, 3)),
|
"score": float(round(score, 3)),
|
||||||
"label": "positive" if score > 0.1 else ("negative" if score < -0.1 else "neutral"),
|
"label": "positive"
|
||||||
"reason": "BERT automated analysis"
|
if score > 0.1
|
||||||
})
|
else ("negative" if score < -0.1 else "neutral"),
|
||||||
|
"reason": "BERT automated analysis",
|
||||||
|
}
|
||||||
|
)
|
||||||
return processed
|
return processed
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"BERT analysis failed: {e}")
|
logger.error(f"BERT analysis failed: {e}")
|
||||||
return [{"score": 0.0, "label": "error", "reason": str(e)}] * len(texts)
|
return [{"score": 0.0, "label": "error", "reason": str(e)}] * len(texts)
|
||||||
|
|
||||||
def batch_update_news_sentiment(self, source: Optional[str] = None, limit: int = 50, use_bert: Optional[bool] = None):
|
def batch_update_news_sentiment(
|
||||||
|
self,
|
||||||
|
source: Optional[str] = None,
|
||||||
|
limit: int = 50,
|
||||||
|
use_bert: Optional[bool] = None,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
批量更新数据库中新闻的情绪分数。
|
批量更新数据库中新闻的情绪分数。
|
||||||
|
|
||||||
@@ -193,38 +233,54 @@ class SentimentTools:
|
|||||||
成功更新的新闻数量。
|
成功更新的新闻数量。
|
||||||
"""
|
"""
|
||||||
news_items = self.db.get_daily_news(source=source, limit=limit)
|
news_items = self.db.get_daily_news(source=source, limit=limit)
|
||||||
to_analyze = [item for item in news_items if not item.get('sentiment_score')]
|
to_analyze = [item for item in news_items if not item.get("sentiment_score")]
|
||||||
|
|
||||||
if not to_analyze:
|
if not to_analyze:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
# 决定使用哪种方法
|
# 决定使用哪种方法
|
||||||
should_use_bert = use_bert if use_bert is not None else (self.bert_pipeline is not None and self.mode != "llm")
|
should_use_bert = (
|
||||||
|
use_bert
|
||||||
|
if use_bert is not None
|
||||||
|
else (self.bert_pipeline is not None and self.mode != "llm")
|
||||||
|
)
|
||||||
|
|
||||||
updated_count = 0
|
updated_count = 0
|
||||||
cursor = self.db.conn.cursor()
|
cursor = self.db.conn.cursor()
|
||||||
|
|
||||||
if should_use_bert and self.bert_pipeline:
|
if should_use_bert and self.bert_pipeline:
|
||||||
logger.info(f"🚀 Using BERT for batch analysis of {len(to_analyze)} items...")
|
logger.info(
|
||||||
titles = [item['title'] for item in to_analyze]
|
f"🚀 Using BERT for batch analysis of {len(to_analyze)} items..."
|
||||||
|
)
|
||||||
|
titles = [item["title"] for item in to_analyze]
|
||||||
results = self.analyze_sentiment_bert(titles)
|
results = self.analyze_sentiment_bert(titles)
|
||||||
|
|
||||||
for item, analysis in zip(to_analyze, results):
|
for item, analysis in zip(to_analyze, results):
|
||||||
cursor.execute("""
|
cursor.execute(
|
||||||
|
"""
|
||||||
UPDATE daily_news
|
UPDATE daily_news
|
||||||
SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?)
|
SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?)
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
""", (analysis['score'], analysis['reason'], item['id']))
|
""",
|
||||||
|
(analysis["score"], analysis["reason"], item["id"]),
|
||||||
|
)
|
||||||
updated_count += 1
|
updated_count += 1
|
||||||
else:
|
else:
|
||||||
logger.info(f"🚶 Using LLM for analysis of {len(to_analyze)} items...")
|
logger.info(f"🚶 Using LLM for analysis of {len(to_analyze)} items...")
|
||||||
for item in to_analyze:
|
for item in to_analyze:
|
||||||
analysis = self.analyze_sentiment_llm(item['title'])
|
analysis = self.analyze_sentiment_llm(item["title"])
|
||||||
cursor.execute("""
|
cursor.execute(
|
||||||
|
"""
|
||||||
UPDATE daily_news
|
UPDATE daily_news
|
||||||
SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?)
|
SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?)
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
""", (analysis.get('score', 0.0), analysis.get('reason', ''), item['id']))
|
""",
|
||||||
|
(
|
||||||
|
analysis.get("score", 0.0),
|
||||||
|
analysis.get("reason", ""),
|
||||||
|
item["id"],
|
||||||
|
),
|
||||||
|
)
|
||||||
updated_count += 1
|
updated_count += 1
|
||||||
|
|
||||||
self.db.conn.commit()
|
self.db.conn.commit()
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ if __name__ == "__main__":
|
|||||||
load_dotenv(os.path.expanduser("~/.config/opencode/.env"))
|
load_dotenv(os.path.expanduser("~/.config/opencode/.env"))
|
||||||
|
|
||||||
# 测试当前配置的模型
|
# 测试当前配置的模型
|
||||||
p = os.getenv("LLM_PROVIDER", "ust")
|
p = os.getenv("LLM_PROVIDER", "minimax")
|
||||||
m = os.getenv("LLM_MODEL", "Qwen")
|
m = os.getenv("LLM_MODEL", "Qwen")
|
||||||
|
|
||||||
print(f"Testing {p}/{m}...")
|
print(f"Testing {p}/{m}...")
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ if __name__ == "__main__":
|
|||||||
load_dotenv(os.path.expanduser("~/.config/opencode/.env"))
|
load_dotenv(os.path.expanduser("~/.config/opencode/.env"))
|
||||||
|
|
||||||
# 测试当前配置的模型
|
# 测试当前配置的模型
|
||||||
p = os.getenv("LLM_PROVIDER", "ust")
|
p = os.getenv("LLM_PROVIDER", "minimax")
|
||||||
m = os.getenv("LLM_MODEL", "Qwen")
|
m = os.getenv("LLM_MODEL", "Qwen")
|
||||||
|
|
||||||
print(f"Testing {p}/{m}...")
|
print(f"Testing {p}/{m}...")
|
||||||
|
|||||||
@@ -97,7 +97,7 @@ class AutoSynthesisTrainer:
|
|||||||
self.model.load_state_dict(base_model.state_dict(), strict=False)
|
self.model.load_state_dict(base_model.state_dict(), strict=False)
|
||||||
|
|
||||||
# LLM for causality verification
|
# LLM for causality verification
|
||||||
provider = os.getenv("LLM_PROVIDER", "ust")
|
provider = os.getenv("LLM_PROVIDER", "minimax")
|
||||||
model_id = os.getenv("LLM_MODEL", "Qwen")
|
model_id = os.getenv("LLM_MODEL", "Qwen")
|
||||||
self.llm_agent = Agent(model=get_model(provider, model_id))
|
self.llm_agent = Agent(model=get_model(provider, model_id))
|
||||||
|
|
||||||
|
|||||||
@@ -563,7 +563,7 @@ class SearchTools:
|
|||||||
4. Return strictly JSON: {{"reuse": true/false, "index": <candidate_index_int>, "reason": "short explanation"}}
|
4. Return strictly JSON: {{"reuse": true/false, "index": <candidate_index_int>, "reason": "short explanation"}}
|
||||||
"""
|
"""
|
||||||
# 初始化模型
|
# 初始化模型
|
||||||
provider = os.getenv("LLM_PROVIDER", "ust")
|
provider = os.getenv("LLM_PROVIDER", "minimax")
|
||||||
model_id = os.getenv("LLM_MODEL", "Qwen")
|
model_id = os.getenv("LLM_MODEL", "Qwen")
|
||||||
host = os.getenv("LLM_HOST")
|
host = os.getenv("LLM_HOST")
|
||||||
if host:
|
if host:
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ from .database_manager import DatabaseManager
|
|||||||
# 从环境变量读取默认情绪分析模式
|
# 从环境变量读取默认情绪分析模式
|
||||||
DEFAULT_SENTIMENT_MODE = os.getenv("SENTIMENT_MODE", "auto") # auto, bert, llm
|
DEFAULT_SENTIMENT_MODE = os.getenv("SENTIMENT_MODE", "auto") # auto, bert, llm
|
||||||
|
|
||||||
|
|
||||||
class SentimentTools:
|
class SentimentTools:
|
||||||
"""
|
"""
|
||||||
情绪分析工具 - 支持 LLM 和 BERT 两种模式
|
情绪分析工具 - 支持 LLM 和 BERT 两种模式
|
||||||
@@ -21,8 +22,13 @@ class SentimentTools:
|
|||||||
可通过环境变量 SENTIMENT_MODE 设置默认模式。
|
可通过环境变量 SENTIMENT_MODE 设置默认模式。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, db: DatabaseManager, mode: Optional[str] = None,
|
def __init__(
|
||||||
model_provider: str = "openai", model_id: str = "gpt-4o"):
|
self,
|
||||||
|
db: DatabaseManager,
|
||||||
|
mode: Optional[str] = None,
|
||||||
|
model_provider: str = "openai",
|
||||||
|
model_id: str = "gpt-4o",
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
初始化情绪分析工具。
|
初始化情绪分析工具。
|
||||||
|
|
||||||
@@ -39,8 +45,12 @@ class SentimentTools:
|
|||||||
|
|
||||||
# Initialize LLM
|
# Initialize LLM
|
||||||
try:
|
try:
|
||||||
provider = "ust" if os.getenv("UST_KEY_API") else model_provider
|
provider = "minimax" if os.getenv("MINIMAX_API_KEY") else model_provider
|
||||||
m_id = "Qwen" if provider == "ust" else model_id
|
m_id = (
|
||||||
|
os.getenv("LLM_MODEL", "MiniMax-Text-01")
|
||||||
|
if provider == "minimax"
|
||||||
|
else model_id
|
||||||
|
)
|
||||||
self.llm_model = get_model(provider, m_id)
|
self.llm_model = get_model(provider, m_id)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"LLM initialization skipped: {e}")
|
logger.warning(f"LLM initialization skipped: {e}")
|
||||||
@@ -48,39 +58,59 @@ class SentimentTools:
|
|||||||
# Initialize BERT if needed
|
# Initialize BERT if needed
|
||||||
if self.mode in ["bert", "auto"]:
|
if self.mode in ["bert", "auto"]:
|
||||||
try:
|
try:
|
||||||
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
|
from transformers import (
|
||||||
|
pipeline,
|
||||||
|
AutoTokenizer,
|
||||||
|
AutoModelForSequenceClassification,
|
||||||
|
)
|
||||||
from transformers.utils import logging as transformers_logging
|
from transformers.utils import logging as transformers_logging
|
||||||
|
|
||||||
transformers_logging.set_verbosity_error() # 减少冗余日志
|
transformers_logging.set_verbosity_error() # 减少冗余日志
|
||||||
|
|
||||||
bert_model = os.getenv("BERT_SENTIMENT_MODEL", "uer/roberta-base-finetuned-chinanews-chinese")
|
bert_model = os.getenv(
|
||||||
|
"BERT_SENTIMENT_MODEL",
|
||||||
|
"uer/roberta-base-finetuned-chinanews-chinese",
|
||||||
|
)
|
||||||
|
|
||||||
# 优先使用本地缓存
|
# 优先使用本地缓存
|
||||||
try:
|
try:
|
||||||
tokenizer = AutoTokenizer.from_pretrained(bert_model, local_files_only=True)
|
tokenizer = AutoTokenizer.from_pretrained(
|
||||||
model = AutoModelForSequenceClassification.from_pretrained(bert_model, local_files_only=True)
|
bert_model, local_files_only=True
|
||||||
|
)
|
||||||
|
model = AutoModelForSequenceClassification.from_pretrained(
|
||||||
|
bert_model, local_files_only=True
|
||||||
|
)
|
||||||
|
|
||||||
self.bert_pipeline = pipeline(
|
self.bert_pipeline = pipeline(
|
||||||
"sentiment-analysis",
|
"sentiment-analysis",
|
||||||
model=model,
|
model=model,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
device=-1
|
device=-1,
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"✅ BERT pipeline loaded from local cache: {bert_model}"
|
||||||
)
|
)
|
||||||
logger.info(f"✅ BERT pipeline loaded from local cache: {bert_model}")
|
|
||||||
except (OSError, ValueError, ImportError):
|
except (OSError, ValueError, ImportError):
|
||||||
# 本地没有,则从网络下载
|
# 本地没有,则从网络下载
|
||||||
logger.info(f"📡 Downloading BERT model: {bert_model}...")
|
logger.info(f"📡 Downloading BERT model: {bert_model}...")
|
||||||
tokenizer = AutoTokenizer.from_pretrained(bert_model)
|
tokenizer = AutoTokenizer.from_pretrained(bert_model)
|
||||||
model = AutoModelForSequenceClassification.from_pretrained(bert_model)
|
model = AutoModelForSequenceClassification.from_pretrained(
|
||||||
|
bert_model
|
||||||
|
)
|
||||||
|
|
||||||
self.bert_pipeline = pipeline(
|
self.bert_pipeline = pipeline(
|
||||||
"sentiment-analysis",
|
"sentiment-analysis",
|
||||||
model=model,
|
model=model,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
device=-1
|
device=-1,
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"✅ BERT Sentiment pipeline ({bert_model}) initialized."
|
||||||
)
|
)
|
||||||
logger.info(f"✅ BERT Sentiment pipeline ({bert_model}) initialized.")
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
logger.warning("Transformers library not installed. BERT sentiment analysis disabled.")
|
logger.warning(
|
||||||
|
"Transformers library not installed. BERT sentiment analysis disabled."
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.mode == "bert":
|
if self.mode == "bert":
|
||||||
logger.error(f"BERT mode requested but failed: {e}")
|
logger.error(f"BERT mode requested but failed: {e}")
|
||||||
@@ -88,7 +118,6 @@ class SentimentTools:
|
|||||||
logger.warning(f"BERT unavailable, using LLM only. Error: {e}")
|
logger.warning(f"BERT unavailable, using LLM only. Error: {e}")
|
||||||
self.bert_pipeline = None
|
self.bert_pipeline = None
|
||||||
|
|
||||||
|
|
||||||
def analyze_sentiment(self, text: str) -> Dict[str, Union[float, str]]:
|
def analyze_sentiment(self, text: str) -> Dict[str, Union[float, str]]:
|
||||||
"""
|
"""
|
||||||
分析文本的情绪极性。根据初始化时的 mode 自动选择分析方法。
|
分析文本的情绪极性。根据初始化时的 mode 自动选择分析方法。
|
||||||
@@ -155,32 +184,43 @@ class SentimentTools:
|
|||||||
与输入列表等长的分析结果列表。
|
与输入列表等长的分析结果列表。
|
||||||
"""
|
"""
|
||||||
if not self.bert_pipeline:
|
if not self.bert_pipeline:
|
||||||
return [{"score": 0.0, "label": "error", "reason": "BERT not available"}] * len(texts)
|
return [
|
||||||
|
{"score": 0.0, "label": "error", "reason": "BERT not available"}
|
||||||
|
] * len(texts)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
results = self.bert_pipeline(texts, truncation=True, max_length=512)
|
results = self.bert_pipeline(texts, truncation=True, max_length=512)
|
||||||
processed = []
|
processed = []
|
||||||
for r in results:
|
for r in results:
|
||||||
label = r['label'].lower()
|
label = r["label"].lower()
|
||||||
score = r['score']
|
score = r["score"]
|
||||||
|
|
||||||
# 标准化不同模型的标签格式
|
# 标准化不同模型的标签格式
|
||||||
if 'negative' in label or 'neg' in label:
|
if "negative" in label or "neg" in label:
|
||||||
score = -score
|
score = -score
|
||||||
elif 'neutral' in label or 'neu' in label:
|
elif "neutral" in label or "neu" in label:
|
||||||
score = 0.0
|
score = 0.0
|
||||||
|
|
||||||
processed.append({
|
processed.append(
|
||||||
|
{
|
||||||
"score": float(round(score, 3)),
|
"score": float(round(score, 3)),
|
||||||
"label": "positive" if score > 0.1 else ("negative" if score < -0.1 else "neutral"),
|
"label": "positive"
|
||||||
"reason": "BERT automated analysis"
|
if score > 0.1
|
||||||
})
|
else ("negative" if score < -0.1 else "neutral"),
|
||||||
|
"reason": "BERT automated analysis",
|
||||||
|
}
|
||||||
|
)
|
||||||
return processed
|
return processed
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"BERT analysis failed: {e}")
|
logger.error(f"BERT analysis failed: {e}")
|
||||||
return [{"score": 0.0, "label": "error", "reason": str(e)}] * len(texts)
|
return [{"score": 0.0, "label": "error", "reason": str(e)}] * len(texts)
|
||||||
|
|
||||||
def batch_update_news_sentiment(self, source: Optional[str] = None, limit: int = 50, use_bert: Optional[bool] = None):
|
def batch_update_news_sentiment(
|
||||||
|
self,
|
||||||
|
source: Optional[str] = None,
|
||||||
|
limit: int = 50,
|
||||||
|
use_bert: Optional[bool] = None,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
批量更新数据库中新闻的情绪分数。
|
批量更新数据库中新闻的情绪分数。
|
||||||
|
|
||||||
@@ -193,38 +233,54 @@ class SentimentTools:
|
|||||||
成功更新的新闻数量。
|
成功更新的新闻数量。
|
||||||
"""
|
"""
|
||||||
news_items = self.db.get_daily_news(source=source, limit=limit)
|
news_items = self.db.get_daily_news(source=source, limit=limit)
|
||||||
to_analyze = [item for item in news_items if not item.get('sentiment_score')]
|
to_analyze = [item for item in news_items if not item.get("sentiment_score")]
|
||||||
|
|
||||||
if not to_analyze:
|
if not to_analyze:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
# 决定使用哪种方法
|
# 决定使用哪种方法
|
||||||
should_use_bert = use_bert if use_bert is not None else (self.bert_pipeline is not None and self.mode != "llm")
|
should_use_bert = (
|
||||||
|
use_bert
|
||||||
|
if use_bert is not None
|
||||||
|
else (self.bert_pipeline is not None and self.mode != "llm")
|
||||||
|
)
|
||||||
|
|
||||||
updated_count = 0
|
updated_count = 0
|
||||||
cursor = self.db.conn.cursor()
|
cursor = self.db.conn.cursor()
|
||||||
|
|
||||||
if should_use_bert and self.bert_pipeline:
|
if should_use_bert and self.bert_pipeline:
|
||||||
logger.info(f"🚀 Using BERT for batch analysis of {len(to_analyze)} items...")
|
logger.info(
|
||||||
titles = [item['title'] for item in to_analyze]
|
f"🚀 Using BERT for batch analysis of {len(to_analyze)} items..."
|
||||||
|
)
|
||||||
|
titles = [item["title"] for item in to_analyze]
|
||||||
results = self.analyze_sentiment_bert(titles)
|
results = self.analyze_sentiment_bert(titles)
|
||||||
|
|
||||||
for item, analysis in zip(to_analyze, results):
|
for item, analysis in zip(to_analyze, results):
|
||||||
cursor.execute("""
|
cursor.execute(
|
||||||
|
"""
|
||||||
UPDATE daily_news
|
UPDATE daily_news
|
||||||
SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?)
|
SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?)
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
""", (analysis['score'], analysis['reason'], item['id']))
|
""",
|
||||||
|
(analysis["score"], analysis["reason"], item["id"]),
|
||||||
|
)
|
||||||
updated_count += 1
|
updated_count += 1
|
||||||
else:
|
else:
|
||||||
logger.info(f"🚶 Using LLM for analysis of {len(to_analyze)} items...")
|
logger.info(f"🚶 Using LLM for analysis of {len(to_analyze)} items...")
|
||||||
for item in to_analyze:
|
for item in to_analyze:
|
||||||
analysis = self.analyze_sentiment_llm(item['title'])
|
analysis = self.analyze_sentiment_llm(item["title"])
|
||||||
cursor.execute("""
|
cursor.execute(
|
||||||
|
"""
|
||||||
UPDATE daily_news
|
UPDATE daily_news
|
||||||
SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?)
|
SET sentiment_score = ?, meta_data = json_set(COALESCE(meta_data, '{}'), '$.sentiment_reason', ?)
|
||||||
WHERE id = ?
|
WHERE id = ?
|
||||||
""", (analysis.get('score', 0.0), analysis.get('reason', ''), item['id']))
|
""",
|
||||||
|
(
|
||||||
|
analysis.get("score", 0.0),
|
||||||
|
analysis.get("reason", ""),
|
||||||
|
item["id"],
|
||||||
|
),
|
||||||
|
)
|
||||||
updated_count += 1
|
updated_count += 1
|
||||||
|
|
||||||
self.db.conn.commit()
|
self.db.conn.commit()
|
||||||
|
|||||||
Reference in New Issue
Block a user