Merge branch 'recover-stash'

This commit is contained in:
ajaysi
2026-05-23 13:13:18 +05:30
40 changed files with 1870 additions and 859 deletions

View File

@@ -122,9 +122,6 @@ class MediumBlogGenerator:
payload = {
"title": req.title,
"globalTargetWords": req.globalTargetWords or 1000,
"persona": req.persona.dict() if req.persona else None,
"tone": req.tone,
"audience": req.audience,
"sections": [section_block(s) for s in req.sections],
}
@@ -136,7 +133,6 @@ class MediumBlogGenerator:
- Industry: {req.persona.industry or 'General'}
- Tone: {req.persona.tone or 'Professional'}
- Audience: {req.persona.audience or 'General readers'}
- Persona ID: {req.persona.persona_id or 'Default'}
Write content that reflects this persona's expertise and communication style.
Use industry-specific terminology and examples where appropriate.
@@ -154,40 +150,19 @@ class MediumBlogGenerator:
"Return ONLY valid JSON with no markdown formatting or explanations."
)
# Build persona-specific content instructions
persona_instructions = ""
if req.persona:
industry = req.persona.industry or 'General'
tone = req.persona.tone or 'Professional'
audience = req.persona.audience or 'General readers'
persona_instructions = f"""
PERSONA-DRIVEN CONTENT REQUIREMENTS:
- Write as an expert in {industry} industry
- Use {tone} tone appropriate for {audience}
- Include industry-specific examples and terminology
- Demonstrate authority and expertise in the field
- Use language that resonates with {audience}
- Maintain consistent voice that reflects this persona's expertise
"""
prompt = (
f"Write blog content for the following sections. Each section should be {req.globalTargetWords or 1000} words total, distributed across all sections.\n\n"
f"Write blog content for the following sections. Total target: {req.globalTargetWords or 1000} words, distributed across all sections.\n\n"
f"Blog Title: {req.title}\n\n"
"For each section, write engaging content that:\n"
"- Follows the key points provided\n"
"- Uses the suggested keywords naturally\n"
"- Meets the target word count\n"
"- Maintains professional tone\n"
"- References the provided sources when relevant\n"
"- Breaks content into clear paragraphs (2-4 sentences each)\n"
"- Uses double line breaks (\\n\\n) between paragraphs for proper formatting\n"
"- Uses double line breaks (\\n\\n) between paragraphs\n"
"- Starts with an engaging opening paragraph\n"
"- Ends with a strong concluding paragraph\n"
f"{persona_instructions}\n"
"IMPORTANT: Format the 'content' field with proper paragraph breaks using \\n\\n between paragraphs.\n\n"
"Return a JSON object with 'title' and 'sections' array. Each section should have 'id', 'heading', 'content', and 'wordCount'.\n\n"
f"Sections to write:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
"- Ends with a strong concluding paragraph\n\n"
"Return a JSON object with 'title' and 'sections' array. Each section must have 'id', 'heading', 'content', 'wordCount', and 'sources'.\n\n"
f"Sections:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
)
try:
@@ -195,7 +170,9 @@ class MediumBlogGenerator:
prompt=prompt,
json_struct=schema,
system_prompt=system,
user_id=user_id
user_id=user_id,
max_tokens=None,
temperature=0.3,
)
except HTTPException:
# Re-raise HTTPExceptions (e.g., 429 subscription limit) to preserve error details

View File

@@ -322,7 +322,7 @@ class ExaResearchProvider(BaseProvider):
'text': getattr(result, 'text', ''),
'publishedDate': getattr(result, 'publishedDate', ''),
'author': getattr(result, 'author', ''),
'score': getattr(result, 'score', 0.5),
'score': (lambda v: v if v is not None else 0.5)(getattr(result, 'score', 0.5)),
})
# Track usage

View File

@@ -31,6 +31,7 @@ from models.product_marketing_models import Campaign, CampaignProposal, Campaign
from models.product_asset_models import ProductAsset, ProductStyleTemplate, EcommerceExport
# Podcast Maker models use SubscriptionBase, but import to ensure models are registered
from models.podcast_models import PodcastProject
# Research models use SubscriptionBase
from models.research_models import ResearchProject
# Video Studio models

View File

@@ -2,8 +2,9 @@
GSC Brainstorm Service for ALwrity.
Analyzes Google Search Console data to suggest blog topics the user should write about.
Combines rule-based heuristics (high-impression/low-CTR keywords, near-page-1 positions)
with LLM-powered strategic recommendations tailored to the user's topic intent.
Combines rule-based heuristics with LLM-powered strategic recommendations tailored to
the user's topic intent. Designed for non-SEO-experts: every insight includes plain-English
explanations of WHY it matters and WHAT to do about it.
"""
import json
@@ -21,9 +22,10 @@ class GSCBrainstormService:
Flow:
1. Fetch real GSC search analytics (query + page data, 30 days)
2. Apply rule-based filters (Content Optimization, Content Enhancement, Keyword Gap)
3. Generate LLM-powered strategic recommendations contextualised to the user's keywords
4. Return structured results
2. Compute derived metrics (CTR benchmarks, estimated traffic uplift, content formats)
3. Apply rule-based filters (Quick Wins, Optimization, Enhancement, Rising Stars, Page Issues)
4. Generate LLM-powered strategic recommendations contextualised to the user's keywords
5. Return structured results with all data exposed for rich frontend display
"""
def __init__(self, gsc_service: GSCService = None):
@@ -39,18 +41,8 @@ class GSCBrainstormService:
keywords: str,
site_url: Optional[str] = None,
) -> Dict[str, Any]:
"""
Generate blog topic suggestions from the user's GSC data.
Args:
user_id: Clerk user ID (must have GSC connected).
keywords: User's 3+ word topic intent (e.g. "content marketing strategy").
site_url: Optional site URL; auto-selected from user's first GSC site if omitted.
Returns:
Dict with content_opportunities, keyword_gaps, ai_recommendations, summary.
"""
self._user_id = user_id
# 1. Resolve site_url
if not site_url:
sites = self.gsc_service.get_site_list(user_id)
@@ -59,6 +51,8 @@ class GSCBrainstormService:
"error": "No GSC sites found. Make sure your site is verified in Google Search Console.",
"content_opportunities": [],
"keyword_gaps": [],
"quick_wins": [],
"page_opportunities": [],
"ai_recommendations": {},
"summary": {},
}
@@ -80,6 +74,8 @@ class GSCBrainstormService:
"error": analytics.get("error", "Failed to fetch GSC data"),
"content_opportunities": [],
"keyword_gaps": [],
"quick_wins": [],
"page_opportunities": [],
"ai_recommendations": {},
"summary": {},
}
@@ -93,9 +89,11 @@ class GSCBrainstormService:
if not keywords_data:
return {
"error": "No keyword data available for the selected period.",
"error": "No keyword data available for the selected period. This usually means your site is new to GSC or hasn't received search traffic yet.",
"content_opportunities": [],
"keyword_gaps": [],
"quick_wins": [],
"page_opportunities": [],
"ai_recommendations": {},
"summary": {
"site_url": site_url,
@@ -107,18 +105,23 @@ class GSCBrainstormService:
# 4. Rule-based analysis
content_opportunities = self._identify_content_opportunities(keywords_data)
keyword_gaps = self._identify_keyword_gaps(keywords_data)
quick_wins = self._identify_quick_wins(keywords_data)
page_opportunities = self._identify_page_opportunities(pages_data)
# 5. Summary metrics
summary = self._compute_summary(keywords_data, pages_data, site_url, start_date, end_date)
# 6. AI recommendations (best-effort; don't fail the whole request on LLM error)
# 6. AI recommendations
ai_recommendations = self._generate_ai_recommendations(
keywords_data, pages_data, summary, keywords
keywords_data, pages_data, summary, keywords,
content_opportunities, quick_wins, keyword_gaps,
)
return {
"content_opportunities": content_opportunities,
"keyword_gaps": keyword_gaps,
"quick_wins": quick_wins,
"page_opportunities": page_opportunities,
"ai_recommendations": ai_recommendations,
"summary": summary,
}
@@ -168,39 +171,53 @@ class GSCBrainstormService:
opportunities: List[Dict[str, Any]] = []
# Rule 1: Content Optimization — high impressions, low CTR
# Meaning: Google is SHOWING your page for this query but people aren't clicking.
# The content probably ranks but title/meta/snippet isn't compelling enough.
for kw in keywords_data:
if kw["impressions"] > 500 and kw["ctr"] < 3:
estimated_gain = int(kw["impressions"] * 0.05) - kw["clicks"]
opportunities.append({
"type": "Content Optimization",
"keyword": kw["keyword"],
"opportunity": (
f"Optimize existing content for '{kw['keyword']}' "
f"to improve CTR from {kw['ctr']:.1f}% "
f"(position {kw['position']:.1f})"
f"Your site appears for '{kw['keyword']}' ({kw['impressions']:,} times/month) "
f"but only {kw['ctr']:.1f}% click. Improving your title and meta description "
f"could bring ~{max(estimated_gain, 5)} more clicks/month."
),
"potential_impact": "High",
"potential_impact": "High" if kw["impressions"] > 1000 else "Medium",
"current_position": kw["position"],
"current_ctr": kw["ctr"],
"impressions": kw["impressions"],
"clicks": kw["clicks"],
"estimated_traffic_gain": max(estimated_gain, 5),
"priority": "High" if kw["impressions"] > 1000 else "Medium",
"suggested_format": GSCBrainstormService._suggest_format(kw["keyword"]),
})
# Rule 2: Content Enhancement — positions 11-20 with decent impressions
# Meaning: You're on page 2 of Google. A small content boost could push you to page 1,
# where CTR increases dramatically (page 1 gets ~95% of all clicks).
for kw in keywords_data:
if 10 < kw["position"] <= 20 and kw["impressions"] > 100:
estimated_gain = int(kw["impressions"] * 0.08)
opportunities.append({
"type": "Content Enhancement",
"keyword": kw["keyword"],
"opportunity": (
f"Enhance content for '{kw['keyword']}' to move from "
f"position {kw['position']:.1f} to the first page"
f"'{kw['keyword']}' ranks #{kw['position']:.0f} (page 2). "
f"Moving to page 1 could capture ~{estimated_gain} more clicks/month "
f"from {kw['impressions']:,} impressions."
),
"potential_impact": "Medium",
"potential_impact": "High" if kw["impressions"] > 500 else "Medium",
"current_position": kw["position"],
"current_ctr": kw["ctr"],
"impressions": kw["impressions"],
"priority": "Medium",
"clicks": kw["clicks"],
"estimated_traffic_gain": estimated_gain,
"priority": "High" if kw["impressions"] > 500 else "Medium",
"suggested_format": GSCBrainstormService._suggest_format(kw["keyword"]),
})
# Sort by impressions descending, keep top 10
opportunities.sort(key=lambda x: x["impressions"], reverse=True)
return opportunities[:10]
@@ -212,15 +229,111 @@ class GSCBrainstormService:
for kw in keywords_data:
if 4 <= kw["position"] <= 20 and kw["impressions"] >= 50:
# Estimate traffic gain if this keyword moved to position 1-3
# Position 1 avg CTR ~31%, position 3 ~11%, current position CTR estimate
position_1_ctr = 31.0
current_ctr = kw["ctr"]
estimated_gain = max(int(kw["impressions"] * (position_1_ctr - current_ctr) / 100), 1)
gaps.append({
"keyword": kw["keyword"],
"position": kw["position"],
"impressions": kw["impressions"],
"current_ctr": kw["ctr"],
"clicks": kw["clicks"],
"estimated_traffic_if_page1": estimated_gain,
"gap_from_page1": round(kw["position"] - 3, 1),
})
gaps.sort(key=lambda x: x["impressions"], reverse=True)
return gaps[:10]
@staticmethod
def _identify_quick_wins(
keywords_data: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Keywords already on page 1 (positions 4-10) that could reach top 3
with minor improvements — the highest-ROI opportunities."""
quick_wins: List[Dict[str, Any]] = []
for kw in keywords_data:
if 4 <= kw["position"] <= 10 and kw["impressions"] >= 100:
# Position 3 CTR ≈ 11%, position 5 CTR ≈ 6%
# Small improvements can yield big traffic gains
target_ctr = 11.0 # approximate CTR for position 3
estimated_gain = max(int(kw["impressions"] * (target_ctr - kw["ctr"]) / 100), 1)
quick_wins.append({
"keyword": kw["keyword"],
"position": kw["position"],
"impressions": kw["impressions"],
"current_ctr": kw["ctr"],
"clicks": kw["clicks"],
"estimated_traffic_gain": estimated_gain,
"reason": (
f"Already on page 1 at position #{kw['position']:.0f}. "
f"Optimizing this page could increase CTR from {kw['ctr']:.1f}% "
f"to ~{target_ctr:.0f}%, gaining ~{estimated_gain} clicks/month."
),
})
quick_wins.sort(key=lambda x: x["estimated_traffic_gain"], reverse=True)
return quick_wins[:5]
@staticmethod
def _identify_page_opportunities(
pages_data: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
"""Pages with high impressions but low CTR — the content or meta needs work."""
opportunities: List[Dict[str, Any]] = []
for pg in pages_data:
if pg["impressions"] > 300 and pg["ctr"] < 2.0:
short_page = pg["page"].rstrip("/").rsplit("/", 1)[-1].replace("-", " ").title()
if len(short_page) > 60:
short_page = short_page[:57] + "..."
opportunities.append({
"page": pg["page"],
"page_title": short_page,
"impressions": pg["impressions"],
"clicks": pg["clicks"],
"current_ctr": pg["ctr"],
"current_position": pg["position"],
"reason": (
f"This page gets {pg['impressions']:,} impressions but only {pg['ctr']:.1f}% CTR. "
f"Reviewing the title and meta description could significantly boost clicks."
),
})
opportunities.sort(key=lambda x: x["impressions"], reverse=True)
return opportunities[:5]
# ------------------------------------------------------------------ #
# Content format suggestion
# ------------------------------------------------------------------ #
@staticmethod
def _suggest_format(keyword: str) -> str:
"""Suggest a content format based on keyword patterns."""
kw = keyword.lower()
if any(w in kw for w in ["how to", "how do", "guide", "tutorial", "steps"]):
return "How-To Guide"
if any(w in kw for w in ["vs", "versus", "compare", "comparison", "difference"]):
return "Comparison"
if any(w in kw for w in ["best", "top", "recommended", "review", "reviews"]):
return "Top Picks / Review"
if any(w in kw for w in ["what is", "definition", "meaning", "explained"]):
return "Explainer"
if any(w in kw for w in ["list", "examples", "ideas", "tips", "ways"]):
return "Listicle"
if any(w in kw for w in ["free", "cheap", "alternative", "budget"]):
return "Budget / Alternative"
if any(w in kw for w in ["template", "calculator", "tool", "checker"]):
return "Tool / Template"
if any(w in kw for w in ["2024", "2025", "2026", "trends", "prediction", "future"]):
return "Trend Report"
return "In-Depth Article"
# ------------------------------------------------------------------ #
# Summary metrics
# ------------------------------------------------------------------ #
@@ -248,6 +361,16 @@ class GSCBrainstormService:
top_keywords = sorted(keywords_data, key=lambda x: x["impressions"], reverse=True)[:5]
top_pages = sorted(pages_data, key=lambda x: x["clicks"], reverse=True)[:3]
# Health score: 0-100 based on how many keywords are on page 1
total_kw = len(keywords_data) or 1
page1_pct = (pos_1_3 + pos_4_10) / total_kw * 100
top3_pct = pos_1_3 / total_kw * 100
health_score = round(min(top3_pct * 3 + page1_pct * 0.7, 100), 0)
# CTR benchmark: industry average is ~3.1% for position 1-10
ctr_benchmark = 3.1
ctr_vs_benchmark = round(avg_ctr - ctr_benchmark, 2)
return {
"site_url": site_url,
"date_range": {"start": start_date, "end": end_date},
@@ -256,6 +379,8 @@ class GSCBrainstormService:
"total_clicks": total_clicks,
"avg_ctr": avg_ctr,
"avg_position": avg_position,
"ctr_vs_benchmark": ctr_vs_benchmark,
"health_score": health_score,
"keyword_distribution": {
"positions_1_3": pos_1_3,
"positions_4_10": pos_4_10,
@@ -263,11 +388,22 @@ class GSCBrainstormService:
"positions_21_plus": pos_21_plus,
},
"top_keywords": [
{"keyword": kw["keyword"], "impressions": kw["impressions"], "position": kw["position"]}
{
"keyword": kw["keyword"],
"impressions": kw["impressions"],
"clicks": kw["clicks"],
"position": kw["position"],
"ctr": kw["ctr"],
}
for kw in top_keywords
],
"top_pages": [
{"page": pg["page"], "clicks": pg["clicks"], "impressions": pg["impressions"]}
{
"page": pg["page"],
"clicks": pg["clicks"],
"impressions": pg["impressions"],
"ctr": pg["ctr"],
}
for pg in top_pages
],
}
@@ -282,60 +418,110 @@ class GSCBrainstormService:
pages_data: List[Dict],
summary: Dict,
user_keywords: str,
content_opportunities: List[Dict],
quick_wins: List[Dict],
keyword_gaps: List[Dict],
) -> Dict[str, Any]:
try:
top_kw = ", ".join(kw["keyword"] for kw in summary.get("top_keywords", []))
top_kw_list = summary.get("top_keywords", [])
top_kw_str = "\n".join(
f"{kw['keyword']}: {kw['impressions']:,} impressions, position {kw['position']}, {kw['ctr']:.1f}% CTR"
for kw in top_kw_list[:10]
)
dist = summary.get("keyword_distribution", {})
prompt = f"""Analyze this Google Search Console data and suggest blog topics the user should write about.
opp_str = ""
if content_opportunities:
opp_str = "\nCONTENT OPPORTUNITIES (rule-based findings):\n" + "\n".join(
f"{o['keyword']}: {o['opportunity']}"
for o in content_opportunities[:5]
)
else:
opp_str = "\nNo major content opportunities detected from rule-based analysis."
USER'S TOPIC INTENT: "{user_keywords}"
qw_str = ""
if quick_wins:
qw_str = "\nQUICK WINS (already on page 1, easy to optimize):\n" + "\n".join(
f"{q['keyword']}: position #{q['position']:.0f}, {q['current_ctr']:.1f}% CTR, est. +{q['estimated_traffic_gain']} clicks/month"
for q in quick_wins[:3]
)
SEARCH PERFORMANCE SUMMARY:
- Total Keywords Tracked: {summary.get('total_keywords_analyzed', 0)}
prompt = f"""You are an expert SEO content strategist analyzing real Google Search Console data for a blog writer.
The user wants to write about: "{user_keywords}"
Here is their GSC data for the last 30 days:
PERFORMANCE OVERVIEW:
- Total Keywords: {summary.get('total_keywords_analyzed', 0)}
- Total Impressions: {summary.get('total_impressions', 0):,}
- Total Clicks: {summary.get('total_clicks', 0):,}
- Average CTR: {summary.get('avg_ctr', 0):.2f}%
- Average CTR: {summary.get('avg_ctr', 0):.2f}% (industry avg for positions 1-10 is ~3.1%)
- Average Position: {summary.get('avg_position', 0):.1f}
- SEO Health Score: {summary.get('health_score', 0)}/100
TOP PERFORMING KEYWORDS:
{top_kw}
TOP KEYWORDS BY IMPRESSIONS:
{top_kw_str}
KEYWORD POSITION DISTRIBUTION:
- Positions 1-3: {dist.get('positions_1_3', 0)}
- Positions 4-10: {dist.get('positions_4_10', 0)}
- Positions 11-20: {dist.get('positions_11_20', 0)}
- Positions 21+: {dist.get('positions_21_plus', 0)}
- Position 1-3 (top results): {dist.get('positions_1_3', 0)} keywords
- Position 4-10 (page 1): {dist.get('positions_4_10', 0)} keywords
- Position 11-20 (page 2): {dist.get('positions_11_20', 0)} keywords
- Position 21+ (page 3+): {dist.get('positions_21_plus', 0)} keywords
{opp_str}
{qw_str}
Based on this data, provide:
Based on this data, provide EXACT blog post suggestions the user should write.
1. IMMEDIATE TOPIC OPPORTUNITIES (0-30 days):
- Specific blog post titles the user should write
- Each tied to a keyword opportunity from the data
- 3-5 suggestions
For each suggestion include:
1. A specific, compelling blog post TITLE (not vague topic)
2. The keyword it targets and why (based on the data above)
3. The recommended content format (how-to, listicle, comparison, etc.)
4. Estimated impact (how many more clicks/month they could gain)
2. CONTENT STRATEGY TOPICS (1-3 months):
- New topic clusters to build authority
- Content pillar ideas
- 3-5 suggestions
3. LONG-TERM CONTENT VISION (3-12 months):
- Market expansion topics
- Authority-building content ideas
- 3-5 suggestions
IMPORTANT: Relate every topic suggestion to the user's interest in "{user_keywords}".
Return your response in this exact JSON format:
Return your response in this EXACT JSON format (no markdown, no code fences):
{{
"immediate_opportunities": ["topic 1", "topic 2", "topic 3"],
"content_strategy": ["strategy 1", "strategy 2", "strategy 3"],
"long_term_strategy": ["vision 1", "vision 2", "vision 3"]
}}"""
"immediate_opportunities": [
{{
"title": "Specific Blog Post Title Here",
"keyword": "target keyword",
"reason": "Why this will work based on the data",
"format": "How-To Guide | Listicle | Comparison | Explainer | etc.",
"estimated_impact": "Estimated X more clicks/month"
}}
],
"content_strategy": [
{{
"title": "Pillar Content Title",
"keyword": "target keyword",
"reason": "Strategic reasoning",
"format": "Content format",
"estimated_impact": "Expected impact"
}}
],
"long_term_strategy": [
{{
"title": "Authority Building Title",
"keyword": "target keyword",
"reason": "Long-term reasoning",
"format": "Content format",
"estimated_impact": "Expected long-term impact"
}}
]
}}
IMPORTANT:
- Provide 3-5 items in each category
- Every suggestion MUST relate to the user's interest in "{user_keywords}"
- Titles should be specific and compelling, like real blog post headlines
- Use the data above to justify each recommendation
- Prioritize keywords with high impressions but low CTR or low position"""
system_prompt = (
"You are an enterprise SEO content strategist. Provide specific, data-driven "
"blog topic suggestions that will improve the user's search performance. "
"Always respond with valid JSON matching the requested format."
"You are an expert SEO content strategist. You analyze Google Search Console data "
"and provide specific, actionable blog post recommendations that will drive real traffic. "
"You always respond with valid JSON matching the requested format. "
"Every recommendation must be backed by the data provided."
)
result = llm_text_gen(
@@ -350,27 +536,58 @@ Return your response in this exact JSON format:
if parsed:
return parsed
return self._fallback_ai_recommendations(keywords_data)
return self._fallback_ai_recommendations(keywords_data, content_opportunities, quick_wins)
except Exception as e:
logger.warning(f"GSC brainstorm AI recommendations failed: {e}")
return self._fallback_ai_recommendations(keywords_data)
return self._fallback_ai_recommendations(keywords_data, content_opportunities, quick_wins)
@staticmethod
def _parse_ai_response(raw: str) -> Optional[Dict[str, List[str]]]:
def _parse_ai_response(self, raw: str) -> Optional[Dict[str, Any]]:
try:
json_start = raw.find("{")
json_end = raw.rfind("}") + 1
# Strip markdown code fences if present
cleaned = raw.strip()
if cleaned.startswith("```"):
first_newline = cleaned.find("\n")
if first_newline != -1:
cleaned = cleaned[first_newline + 1:]
if cleaned.endswith("```"):
cleaned = cleaned[:-3].strip()
json_start = cleaned.find("{")
json_end = cleaned.rfind("}") + 1
if json_start == -1 or json_end == 0:
return None
chunk = raw[json_start:json_end]
chunk = cleaned[json_start:json_end]
parsed = json.loads(chunk)
def normalize_section(section: Any) -> List[Dict[str, str]]:
if not isinstance(section, list):
return []
result = []
for item in section:
if isinstance(item, str):
result.append({
"title": item.split(":")[0].strip() if ":" in item else item[:60],
"keyword": "",
"reason": item,
"format": "",
"estimated_impact": "",
})
elif isinstance(item, dict):
result.append({
"title": str(item.get("title", "")),
"keyword": str(item.get("keyword", "")),
"reason": str(item.get("reason", "")),
"format": str(item.get("format", "")),
"estimated_impact": str(item.get("estimated_impact", "")),
})
return result
return {
"immediate_opportunities": parsed.get("immediate_opportunities", [])[:5],
"content_strategy": parsed.get("content_strategy", [])[:5],
"long_term_strategy": parsed.get("long_term_strategy", [])[:5],
"immediate_opportunities": normalize_section(parsed.get("immediate_opportunities", []))[:5],
"content_strategy": normalize_section(parsed.get("content_strategy", []))[:5],
"long_term_strategy": normalize_section(parsed.get("long_term_strategy", []))[:5],
}
except (json.JSONDecodeError, ValueError) as e:
logger.warning(f"Failed to parse AI brainstorm response as JSON: {e}")
@@ -379,26 +596,53 @@ Return your response in this exact JSON format:
@staticmethod
def _fallback_ai_recommendations(
keywords_data: List[Dict],
content_opportunities: List[Dict],
quick_wins: List[Dict],
) -> Dict[str, Any]:
top_kw = keywords_data[:3] if keywords_data else []
immediate = []
for kw in top_kw:
immediate.append(
f"Write a comprehensive guide on '{kw['keyword']}' "
f"(currently at position {kw['position']:.1f} with "
f"{kw['impressions']} impressions)"
)
# Build from quick wins first (highest ROI)
for qw in quick_wins[:2]:
immediate.append({
"title": f"How to Rank #{int(qw['position'])} for '{qw['keyword']}' — Optimization Guide",
"keyword": qw["keyword"],
"reason": qw.get("reason", f"Already on page 1 at position {qw['position']:.0f}"),
"format": "How-To Guide",
"estimated_impact": f"+{qw.get('estimated_traffic_gain', 10)} clicks/month",
})
# Then from content opportunities
for opp in content_opportunities[:2]:
immediate.append({
"title": f"Complete Guide to {opp['keyword'].title()}",
"keyword": opp["keyword"],
"reason": opp.get("opportunity", f"{opp['impressions']:,} impressions with room to improve"),
"format": opp.get("suggested_format", "In-Depth Article"),
"estimated_impact": f"+{opp.get('estimated_traffic_gain', 10)} clicks/month",
})
# Fill remaining with top keywords
remaining = 5 - len(immediate)
for kw in top_kw[:remaining]:
immediate.append({
"title": f"The Ultimate Guide to {kw['keyword'].title()}",
"keyword": kw["keyword"],
"reason": f"Top keyword with {kw['impressions']:,} impressions (position {kw['position']:.1f})",
"format": "In-Depth Article",
"estimated_impact": f"+{max(int(kw['impressions'] * 0.03), 5)} clicks/month",
})
return {
"immediate_opportunities": immediate or ["No keyword data available for recommendations"],
"immediate_opportunities": immediate or [{"title": "No keyword data available", "keyword": "", "reason": "Connect GSC to get personalized suggestions", "format": "", "estimated_impact": ""}],
"content_strategy": [
"Develop topic clusters around your top-performing keywords",
"Create comparison and vs-style content for competitive terms",
"Build FAQ sections targeting question-based queries",
{"title": "Topic Cluster: Build Authority Around Your Core Topics", "keyword": "", "reason": "Clustered content ranks higher and captures more long-tail queries", "format": "Pillar Page + Spokes", "estimated_impact": "+50-200 clicks/month over 3 months"},
{"title": "Comparison Guide: Your Product vs. Alternatives", "keyword": "", "reason": "Comparison content captures high-intent searchers ready to decide", "format": "Comparison", "estimated_impact": "+20-80 clicks/month"},
{"title": "FAQ: Answer What Your Audience Is Asking", "keyword": "", "reason": "FAQs capture featured snippets and voice search queries", "format": "FAQ / Listicle", "estimated_impact": "+30-100 clicks/month"},
],
"long_term_strategy": [
"Build domain authority through pillar content",
"Expand into adjacent topic areas",
"Develop thought leadership content series",
{"title": "Pillar Content: The Definitive Resource in Your Niche", "keyword": "", "reason": "Comprehensive guides become authoritative references that attract backlinks", "format": "Long-Form Guide", "estimated_impact": "+100-500 clicks/month over 6-12 months"},
{"title": "Trend Report: What's Next in Your Industry", "keyword": "", "reason": "Forward-looking content captures emerging search demand early", "format": "Trend Report", "estimated_impact": "+50-200 clicks/month"},
{"title": "Thought Leadership: Expert Roundup and Insights", "keyword": "", "reason": "Expert content builds E-E-A-T signals that improve overall domain authority", "format": "Expert Roundup", "estimated_impact": "+30-100 clicks/month per piece"},
],
}

View File

@@ -250,10 +250,10 @@ class GSCService:
flow = Flow.from_client_config(
self.client_config,
scopes=self.scopes,
redirect_uri=redirect_uri
redirect_uri=redirect_uri,
autogenerate_code_verifier=False,
)
# Use a custom state that includes user_id for routing the callback to the correct DB
random_state = secrets.token_urlsafe(32)
state = f"{user_id}:{random_state}"
@@ -300,7 +300,7 @@ class GSCService:
logger.error(f"User database not found for user {user_id}")
return False
# Verify state in user's DB
# Verify state in user's DB (but don't delete yet — delete after successful token exchange)
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
cursor.execute('SELECT user_id FROM gsc_oauth_states WHERE state = ?', (state,))
@@ -309,10 +309,6 @@ class GSCService:
if not result:
logger.error(f"Invalid or expired GSC OAuth state for user {user_id}")
return False
# Clean up state
cursor.execute('DELETE FROM gsc_oauth_states WHERE state = ?', (state,))
conn.commit()
# Exchange code for credentials
if not self.client_config:
@@ -322,12 +318,22 @@ class GSCService:
flow = Flow.from_client_config(
self.client_config,
scopes=self.scopes,
redirect_uri=os.getenv('GSC_REDIRECT_URI', 'http://localhost:8000/gsc/callback')
redirect_uri=os.getenv('GSC_REDIRECT_URI', 'http://localhost:8000/gsc/callback'),
autogenerate_code_verifier=False,
)
flow.fetch_token(code=authorization_code)
credentials = flow.credentials
# State consumed successfully — clean up
try:
with sqlite3.connect(db_path) as conn:
cursor = conn.cursor()
cursor.execute('DELETE FROM gsc_oauth_states WHERE state = ?', (state,))
conn.commit()
except Exception as cleanup_err:
logger.warning(f"Failed to clean up OAuth state: {cleanup_err}")
# Save credentials
return self.save_user_credentials(user_id, credentials)

View File

@@ -343,18 +343,28 @@ class HallucinationDetector:
logger.error(f"Error in batch evidence search: {str(e)}")
return []
def _map_source_refs_from_reasoning(self, reasoning: str, sources: List[Dict[str, Any]]) -> List[int]:
"""Parse 'Source N' references from reasoning text and return 0-based indices."""
import re
indices = set()
for match in re.finditer(r'Source\s+(\d+)', reasoning):
ref = int(match.group(1))
if 1 <= ref <= len(sources):
indices.add(ref - 1) # convert 1-based → 0-based
return sorted(indices)
async def _assess_claims_batch(self, claims: List[str], sources: List[Dict[str, Any]], user_id: str = None) -> List[Claim]:
"""Assess multiple claims against sources in one LLM call."""
try:
claims_to_assess = claims[:3]
combined_sources = "\n\n".join([
f"Source {i+1}: {src.get('url','')}\nText: {src.get('text','')[:1000]}"
f"Source [{i}]: {src.get('url','')}\nText: {src.get('text','')[:1000]}"
for i, src in enumerate(sources)
])
claims_text = "\n".join([
f"Claim {i+1}: {claim}"
f"Claim {i}: {claim}"
for i, claim in enumerate(claims_to_assess)
])
@@ -367,12 +377,14 @@ class HallucinationDetector:
' "claim_index": 0,\n'
' "assessment": "supported" or "refuted" or "insufficient_information",\n'
' "confidence": number between 0.0 and 1.0,\n'
' "supporting_sources": [array of source indices that support the claim],\n'
' "refuting_sources": [array of source indices that refute the claim],\n'
' "supporting_sources": [array of 0-based source indices, e.g. [0, 2] for Source [0] and Source [2]],\n'
' "refuting_sources": [array of 0-based source indices, e.g. [1] for Source [1]],\n'
' "reasoning": "brief explanation of your assessment"\n'
' }\n'
' ]\n'
"}\n\n"
"IMPORTANT: Source indices are 0-based. Source [0] is the first source, Source [1] is the second, etc.\n"
"For every 'supported' or 'refuted' claim you MUST include the relevant source indices.\n\n"
f"Claims to verify:\n{claims_text}\n\n"
f"Sources:\n{combined_sources}\n\n"
"Return only the JSON object:"
@@ -407,6 +419,15 @@ class HallucinationDetector:
if isinstance(idx, int) and 0 <= idx < len(sources):
refuting_sources.append(sources[idx])
# Fallback: parse "Source N" from reasoning text when LLM omits indices
if not supporting_sources and not refuting_sources and sources and assessment.get('reasoning'):
ref_indices = self._map_source_refs_from_reasoning(assessment.get('reasoning', ''), sources)
if ref_indices:
if assessment.get('assessment') == 'supported':
supporting_sources = [sources[i] for i in ref_indices]
elif assessment.get('assessment') == 'refuted':
refuting_sources = [sources[i] for i in ref_indices]
verified_claims.append(Claim(
text=claim,
confidence=float(assessment.get('confidence', 0.5)),
@@ -464,7 +485,7 @@ class HallucinationDetector:
"""Assess whether sources support or refute the claim using LLM."""
try:
combined_sources = "\n\n".join([
f"Source {i+1}: {src.get('url','')}\nText: {src.get('text','')[:2000]}"
f"Source [{i}]: {src.get('url','')}\nText: {src.get('text','')[:2000]}"
for i, src in enumerate(sources)
])
@@ -474,10 +495,12 @@ class HallucinationDetector:
"{\n"
' "assessment": "supported" or "refuted" or "insufficient_information",\n'
' "confidence": number between 0.0 and 1.0,\n'
' "supporting_sources": [array of source indices that support the claim],\n'
' "refuting_sources": [array of source indices that refute the claim],\n'
' "supporting_sources": [array of 0-based source indices, e.g. [0, 2] for Source [0] and Source [2]],\n'
' "refuting_sources": [array of 0-based source indices, e.g. [1] for Source [1]],\n'
' "reasoning": "brief explanation of your assessment"\n'
"}\n\n"
"IMPORTANT: Source indices are 0-based. Source [0] is the first source, Source [1] is the second, etc.\n"
"For 'supported' or 'refuted' you MUST include the relevant source indices.\n\n"
f"Claim to verify: {claim}\n\n"
f"Sources:\n{combined_sources}\n\n"
"Return only the JSON object:"
@@ -508,6 +531,15 @@ class HallucinationDetector:
if isinstance(idx, int) and 0 <= idx < len(sources):
refuting_sources.append(sources[idx])
# Fallback: parse "Source N" from reasoning text when LLM omits indices
if not supporting_sources and not refuting_sources and sources and result.get('reasoning'):
ref_indices = self._map_source_refs_from_reasoning(result.get('reasoning', ''), sources)
if ref_indices:
if result.get('assessment') == 'supported':
supporting_sources = [sources[i] for i in ref_indices]
elif result.get('assessment') == 'refuted':
refuting_sources = [sources[i] for i in ref_indices]
# Validate assessment value
valid_assessments = ['supported', 'refuted', 'insufficient_information']
if result['assessment'] not in valid_assessments:

View File

@@ -46,6 +46,7 @@ def llm_text_gen(
preferred_provider: Optional[str] = None,
flow_type: Optional[str] = None,
max_tokens: Optional[int] = None,
temperature: Optional[float] = None,
) -> str:
"""
Generate text using Language Model (LLM) based on the provided prompt.
@@ -58,6 +59,8 @@ def llm_text_gen(
preferred_hf_models (list, optional): Preferred HuggingFace models.
preferred_provider (str, optional): Preferred provider (google, huggingface).
flow_type (str, optional): Flow type for logging (e.g., 'sif_agent', 'premium_tool').
max_tokens (int, optional): Max tokens for response. If None, provider default is used.
temperature (float, optional): Temperature for generation (0.0-1.0). If None, defaults to 0.7.
Returns:
str: Generated text based on the prompt.
@@ -75,9 +78,8 @@ def llm_text_gen(
# Set default values for LLM parameters
gpt_provider = "google" # Default to Google Gemini
model = "gemini-2.0-flash-001"
temperature = 0.7
if max_tokens is None:
max_tokens = 4000
if temperature is None:
temperature = 0.7
top_p = 0.9
n = 1
fp = 16

View File

@@ -1,6 +1,7 @@
import os
import re
import asyncio
from typing import Any, Dict, List
from typing import Any, Dict, List, Optional
from dataclasses import dataclass
from loguru import logger
import random
@@ -17,42 +18,33 @@ class WritingSuggestion:
class WritingAssistantService:
"""
Minimal writing assistant that combines Exa search with Gemini continuation.
- Exa provides relevant sources with content snippets
- Gemini generates a short, cited continuation based on current text and sources
Writing assistant that combines Exa search with LLM continuation.
- Searches relevant sources using the content near the cursor position
- Generates a short continuation grounded in sources
- Confidence derived from source availability and quality
"""
def __init__(self) -> None:
# COST CONTROL: Daily usage limits
self.daily_api_calls = 0
self.daily_limit = 50 # Max 50 API calls per day (~$2.50 max cost)
self.daily_limit = 50
self.last_reset_date = None
def _get_cached_suggestion(self, text: str) -> WritingSuggestion | None:
"""No cached suggestions - always use real API calls for authentic results."""
return None
def _check_daily_limit(self) -> bool:
"""Check if we're within daily API usage limits."""
import datetime
today = datetime.date.today()
# Reset counter if it's a new day
if self.last_reset_date != today:
self.daily_api_calls = 0
self.last_reset_date = today
# Check if we've exceeded the limit
if self.daily_api_calls >= self.daily_limit:
return False
# Increment counter for this API call
self.daily_api_calls += 1
logger.info(f"Writing assistant API call #{self.daily_api_calls}/{self.daily_limit} today")
return True
async def suggest(self, text: str, user_id: str | None = None) -> List[WritingSuggestion]:
async def suggest(self, text: str, user_id: str | None = None, cursor_position: Optional[int] = None) -> List[WritingSuggestion]:
if not text or len(text.strip()) < 6:
return []
@@ -67,26 +59,41 @@ class WritingAssistantService:
if len(text.strip()) < 50:
return []
# 1) Find relevant sources via Exa
sources = await self._search_sources(text, user_id=user_id)
# Use text before cursor for context (where the user is actively writing)
if cursor_position is not None and 0 < cursor_position <= len(text):
context_text = text[:cursor_position]
else:
context_text = text
# 2) Generate continuation suggestion via LLM grounded in sources
suggestion_text, confidence = await self._generate_continuation(text, sources, user_id=user_id)
# 1) Find relevant sources via Exa (non-fatal)
sources = []
try:
sources = await self._search_sources(context_text, user_id=user_id)
except Exception as e:
logger.warning(f"WritingAssistant Exa search failed, proceeding without sources: {e}")
# 2) Generate continuation suggestion via LLM
suggestion_text, confidence = await self._generate_continuation(context_text, sources, user_id=user_id)
if not suggestion_text:
return []
return [WritingSuggestion(text=suggestion_text.strip(), confidence=confidence, sources=sources)]
async def _search_sources(self, text: str, user_id: str = None) -> List[Dict[str, Any]]:
"""Search for relevant sources using ExaResearchProvider with subscription checks."""
async def _search_sources(self, context_text: str, user_id: str = None) -> List[Dict[str, Any]]:
"""Search Exa using the last sentence before cursor for a focused query."""
try:
from services.blog_writer.research.exa_provider import ExaResearchProvider
exa_query = (
(text[-1000:] if len(text) > 1000 else text)
+ "\n\nIf you found the above interesting, here's another useful resource to read:"
)
# Extract the last sentence from context to use as a focused search query
sentences = re.split(r'(?<=[.!?])\s+', context_text.strip())
last_sentence = sentences[-1].strip().strip('"').strip("'") if sentences else context_text
# If very short, use last two sentences
if len(last_sentence) < 20 and len(sentences) >= 2:
last_sentence = ' '.join(s[-2:]).strip().strip('"').strip("'")
exa_query = last_sentence[:500] if len(last_sentence) > 500 else last_sentence
provider = ExaResearchProvider()
sources = await provider.simple_search(
@@ -95,7 +102,6 @@ class WritingAssistantService:
user_id=user_id,
)
# Normalize keys to match expected format
normalized = []
for s in sources:
normalized.append({
@@ -104,7 +110,7 @@ class WritingAssistantService:
"text": s.get("text", ""),
"author": s.get("author", ""),
"published_date": s.get("publishedDate", ""),
"score": float(s.get("score", 0.5)),
"score": float(s.get("score") if s.get("score") is not None else 0.5),
})
if not normalized:
@@ -151,8 +157,21 @@ class WritingAssistantService:
suggestion = (str(ai_resp or "")).strip()
if not suggestion:
raise Exception("Assistive writer returned empty suggestion")
confidence = 0.7
return suggestion, confidence
# Dynamic confidence based on source quality and response signals
confidence = 0.5
if sources:
# More sources and higher scores = more confident
avg_score = sum(s.get("score", 0.5) for s in sources) / len(sources)
confidence = 0.5 + (len(sources) / 6.0) * 0.3 + avg_score * 0.2
if suggestion.endswith(('.', '!', '?')):
confidence += 0.05
# Check if citation hint was included
if '[http' in suggestion or '((' in suggestion:
confidence += 0.05
confidence = min(confidence, 1.0)
return suggestion, round(confidence, 2)
except Exception as e:
logger.error(f"WritingAssistant _generate_continuation error: {e}")
raise