chore: push all remaining changes
- Blog writer enhancements and bug fixes - Wix integration improvements - Frontend UI updates - GSC dashboard docs cleanup - Image studio assets - LinkedIn requirements file - Various dependency updates
This commit is contained in:
@@ -18,7 +18,7 @@ class CompetitorAnalyzer:
|
||||
Analyze the following research content and extract competitor insights:
|
||||
|
||||
Research Content:
|
||||
{content[:3000]}
|
||||
{content[:8000]}
|
||||
|
||||
Extract and analyze:
|
||||
1. Top competitors mentioned (companies, brands, platforms)
|
||||
|
||||
@@ -17,7 +17,7 @@ class ContentAngleGenerator:
|
||||
Analyze the following research content and create strategic content angles for: {topic} in {industry}
|
||||
|
||||
Research Content:
|
||||
{content[:3000]}
|
||||
{content[:8000]}
|
||||
|
||||
Create 7 compelling content angles that:
|
||||
1. Leverage current trends and data from the research
|
||||
|
||||
@@ -7,6 +7,8 @@ Neural search implementation using Exa API for high-quality, citation-rich resea
|
||||
from exa_py import Exa
|
||||
import os
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from urllib.parse import urlparse
|
||||
from typing import List, Dict, Any
|
||||
from loguru import logger
|
||||
from models.subscription_models import APIProvider
|
||||
@@ -355,6 +357,125 @@ class ExaResearchProvider(BaseProvider):
|
||||
|
||||
return None
|
||||
|
||||
def _calculate_credibility_score(self, result) -> float:
|
||||
"""Dynamic credibility score based on domain authority, recency, and content substance."""
|
||||
scores = []
|
||||
weights = []
|
||||
|
||||
# Domain authority (weight: 3) — most important signal
|
||||
url = result.url if hasattr(result, 'url') else ''
|
||||
domain_score = self._score_domain_authority(url)
|
||||
scores.append(domain_score)
|
||||
weights.append(3)
|
||||
|
||||
# Recency (weight: 2) — fresher content is more valuable
|
||||
recency_score = self._score_recency(result)
|
||||
scores.append(recency_score)
|
||||
weights.append(2)
|
||||
|
||||
# Content substance (weight: 2) — richer content = more substantive source
|
||||
substance_score = self._score_substance(result)
|
||||
scores.append(substance_score)
|
||||
weights.append(2)
|
||||
|
||||
# Exa relevance score (weight: 2) — Exa's own relevance ranking
|
||||
exa_score = 0.5
|
||||
if hasattr(result, 'score') and result.score is not None:
|
||||
exa_score = float(result.score)
|
||||
scores.append(exa_score)
|
||||
weights.append(2)
|
||||
|
||||
total = sum(s * w for s, w in zip(scores, weights))
|
||||
total_weight = sum(weights)
|
||||
return round(total / total_weight, 3)
|
||||
|
||||
@staticmethod
|
||||
def _score_domain_authority(url: str) -> float:
|
||||
if not url:
|
||||
return 0.5
|
||||
try:
|
||||
domain = urlparse(url).netloc.lower()
|
||||
except Exception:
|
||||
return 0.5
|
||||
if domain.startswith('www.'):
|
||||
domain = domain[4:]
|
||||
|
||||
# Tier 1: Government, educational, major research
|
||||
if domain.endswith('.gov') or domain.endswith('.edu'):
|
||||
return 0.95
|
||||
if domain in ('arxiv.org', 'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov',
|
||||
'scholar.google.com', 'researchgate.net', 'sciencedaily.com',
|
||||
'nature.com', 'science.org', 'pnas.org'):
|
||||
return 0.92
|
||||
|
||||
# Tier 2: Major established news and professional publications
|
||||
tier2 = {
|
||||
'reuters.com', 'apnews.com', 'bbc.com', 'bbc.co.uk', 'npr.org',
|
||||
'wsj.com', 'nytimes.com', 'economist.com', 'bloomberg.com',
|
||||
'theguardian.com', 'ft.com', 'washingtonpost.com',
|
||||
'forbes.com', 'hbr.org', 'techcrunch.com', 'wired.com',
|
||||
'cnn.com', 'nbcnews.com', 'cbsnews.com', 'abcnews.go.com',
|
||||
}
|
||||
# Extract base domain
|
||||
parts = domain.split('.')
|
||||
base = '.'.join(parts[-2:]) if len(parts) >= 2 else domain
|
||||
if base in tier2:
|
||||
return 0.88
|
||||
|
||||
# Tier 3: Industry research and established .org
|
||||
tier3 = {
|
||||
'statista.com', 'pewresearch.org', 'gartner.com', 'mckinsey.com',
|
||||
'deloitte.com', 'pwc.com', 'ey.com', 'kpmg.com',
|
||||
'hubspot.com', 'moz.com', 'searchengineland.com',
|
||||
'neilpatel.com', 'backlinko.com', 'copyblogger.com',
|
||||
}
|
||||
if base in tier3:
|
||||
return 0.80
|
||||
if domain.endswith('.org'):
|
||||
return 0.75
|
||||
|
||||
return 0.60
|
||||
|
||||
def _score_recency(self, result) -> float:
|
||||
if not hasattr(result, 'publishedDate') or not result.publishedDate:
|
||||
return 0.70
|
||||
try:
|
||||
published = datetime.strptime(result.publishedDate[:10], '%Y-%m-%d')
|
||||
days_old = (datetime.now() - published).days
|
||||
if days_old < 30:
|
||||
return 1.0
|
||||
elif days_old < 180:
|
||||
return 0.90
|
||||
elif days_old < 365:
|
||||
return 0.80
|
||||
elif days_old < 730:
|
||||
return 0.65
|
||||
elif days_old < 1825:
|
||||
return 0.45
|
||||
else:
|
||||
return 0.25
|
||||
except Exception:
|
||||
return 0.70
|
||||
|
||||
def _score_substance(self, result) -> float:
|
||||
total_chars = 0
|
||||
if hasattr(result, 'highlights') and result.highlights:
|
||||
total_chars += sum(len(h or '') for h in result.highlights)
|
||||
if hasattr(result, 'summary') and result.summary:
|
||||
total_chars += len(result.summary)
|
||||
if hasattr(result, 'text') and result.text:
|
||||
total_chars += len(result.text)
|
||||
|
||||
if total_chars > 2000:
|
||||
return 0.95
|
||||
elif total_chars > 1000:
|
||||
return 0.85
|
||||
elif total_chars > 500:
|
||||
return 0.75
|
||||
elif total_chars > 100:
|
||||
return 0.60
|
||||
return 0.40
|
||||
|
||||
def _transform_sources(self, results):
|
||||
"""Transform Exa results to ResearchSource format."""
|
||||
sources = []
|
||||
@@ -368,7 +489,7 @@ class ExaResearchProvider(BaseProvider):
|
||||
'title': result.title if hasattr(result, 'title') else '',
|
||||
'url': result.url if hasattr(result, 'url') else '',
|
||||
'excerpt': self._get_excerpt(result),
|
||||
'credibility_score': 0.85, # Exa results are high quality
|
||||
'credibility_score': self._calculate_credibility_score(result),
|
||||
'published_at': result.publishedDate if hasattr(result, 'publishedDate') else None,
|
||||
'index': idx,
|
||||
'source_type': source_type,
|
||||
@@ -388,7 +509,7 @@ class ExaResearchProvider(BaseProvider):
|
||||
if hasattr(result, 'summary') and result.summary:
|
||||
return result.summary
|
||||
if hasattr(result, 'text') and result.text:
|
||||
return result.text[:500]
|
||||
return result.text[:1000]
|
||||
return ''
|
||||
|
||||
def _determine_source_type(self, url):
|
||||
|
||||
@@ -19,7 +19,7 @@ class KeywordAnalyzer:
|
||||
Analyze the following research content and extract comprehensive keyword insights for: {', '.join(original_keywords)}
|
||||
|
||||
Research Content:
|
||||
{content[:3000]} # Limit to avoid token limits
|
||||
{content[:8000]}
|
||||
|
||||
Extract and analyze:
|
||||
1. Primary keywords (main topic terms)
|
||||
|
||||
@@ -250,10 +250,32 @@ class ResearchService:
|
||||
if 'content' not in locals() or 'sources' not in locals():
|
||||
raise RuntimeError(f"{config.provider.value} research did not return content or sources. Research failed.")
|
||||
|
||||
# Build compact all-source summary for richer analysis
|
||||
analysis_content = self._build_analysis_content(sources)
|
||||
|
||||
# Run dedicated competitor search for richer competitor intelligence
|
||||
competitor_content = analysis_content
|
||||
try:
|
||||
comp_query = f"top {industry} companies or competitors {topic}"
|
||||
comp_results = await exa_provider.simple_search(
|
||||
query=comp_query, num_results=5, user_id=user_id,
|
||||
)
|
||||
if comp_results:
|
||||
comp_lines = ["COMPETITOR SEARCH RESULTS:"]
|
||||
for r in comp_results:
|
||||
title = r.get('title', '')
|
||||
text = (r.get('text', '') or '')[:400]
|
||||
comp_lines.append(f"- {title}")
|
||||
if text:
|
||||
comp_lines.append(f" {text[:200]}")
|
||||
competitor_content = "\n".join(comp_lines) + "\n\n" + analysis_content
|
||||
except Exception as e:
|
||||
logger.warning(f"Competitor search failed (non-critical): {e}")
|
||||
|
||||
# Continue with common analysis (same for both providers)
|
||||
keyword_analysis = self.keyword_analyzer.analyze(content, request.keywords, user_id=user_id)
|
||||
competitor_analysis = self.competitor_analyzer.analyze(content, user_id=user_id)
|
||||
suggested_angles = self.content_angle_generator.generate(content, topic, industry, user_id=user_id)
|
||||
keyword_analysis = self.keyword_analyzer.analyze(analysis_content, request.keywords, user_id=user_id)
|
||||
competitor_analysis = self.competitor_analyzer.analyze(competitor_content, user_id=user_id)
|
||||
suggested_angles = self.content_angle_generator.generate(analysis_content, topic, industry, user_id=user_id)
|
||||
|
||||
logger.info(f"Research completed successfully with {len(sources)} sources and {len(search_queries)} search queries")
|
||||
|
||||
@@ -586,9 +608,30 @@ class ResearchService:
|
||||
|
||||
# Continue with common analysis (same for both providers)
|
||||
await task_manager.update_progress(task_id, "🔍 Analyzing keywords and content angles...")
|
||||
keyword_analysis = self.keyword_analyzer.analyze(content, request.keywords, user_id=user_id)
|
||||
competitor_analysis = self.competitor_analyzer.analyze(content, user_id=user_id)
|
||||
suggested_angles = self.content_angle_generator.generate(content, topic, industry, user_id=user_id)
|
||||
analysis_content = self._build_analysis_content(sources)
|
||||
|
||||
# Run dedicated competitor search for richer competitor intelligence
|
||||
competitor_content = analysis_content
|
||||
try:
|
||||
comp_query = f"top {industry} companies or competitors {topic}"
|
||||
comp_results = await exa_provider.simple_search(
|
||||
query=comp_query, num_results=5, user_id=user_id,
|
||||
)
|
||||
if comp_results:
|
||||
comp_lines = ["COMPETITOR SEARCH RESULTS:"]
|
||||
for r in comp_results:
|
||||
title = r.get('title', '')
|
||||
text = (r.get('text', '') or '')[:400]
|
||||
comp_lines.append(f"- {title}")
|
||||
if text:
|
||||
comp_lines.append(f" {text[:200]}")
|
||||
competitor_content = "\n".join(comp_lines) + "\n\n" + analysis_content
|
||||
except Exception as e:
|
||||
logger.warning(f"Competitor search failed (non-critical): {e}")
|
||||
|
||||
keyword_analysis = self.keyword_analyzer.analyze(analysis_content, request.keywords, user_id=user_id)
|
||||
competitor_analysis = self.competitor_analyzer.analyze(competitor_content, user_id=user_id)
|
||||
suggested_angles = self.content_angle_generator.generate(analysis_content, topic, industry, user_id=user_id)
|
||||
|
||||
await task_manager.update_progress(task_id, "💾 Caching results for future use...")
|
||||
logger.info(f"Research completed successfully with {len(sources)} sources and {len(search_queries)} search queries")
|
||||
@@ -780,6 +823,33 @@ class ResearchService:
|
||||
web_search_queries=search_queries or [],
|
||||
)
|
||||
|
||||
def _build_analysis_content(self, sources: List[Dict[str, Any]]) -> str:
|
||||
"""Build compact all-source summary for LLM analysis.
|
||||
|
||||
Each source is distilled to one line with title, key content, and highlights.
|
||||
This ensures ALL sources are visible to keyword, competitor, and angle
|
||||
analyzers instead of only the first few (raw content[:3000]).
|
||||
"""
|
||||
if not sources:
|
||||
return ""
|
||||
lines = []
|
||||
for src in sources:
|
||||
title = src.get('title', '') or ''
|
||||
summary = src.get('summary', '') or ''
|
||||
highlights = src.get('highlights', []) or []
|
||||
excerpt = src.get('excerpt', '') or ''
|
||||
part = f"• {title}"
|
||||
if summary:
|
||||
part += f" — {summary[:250]}"
|
||||
elif excerpt:
|
||||
part += f" — {excerpt[:250]}"
|
||||
if highlights:
|
||||
findings = [h[:120] for h in highlights[:2] if h]
|
||||
if findings:
|
||||
part += f" | {'; '.join(findings)}"
|
||||
lines.append(part)
|
||||
return "\n".join(lines)
|
||||
|
||||
def _normalize_cached_research_data(self, cached_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Normalize cached research data to fix None values in confidence_scores.
|
||||
|
||||
Reference in New Issue
Block a user