Files
ALwrity/backend/api/podcast/handlers/research.py
ajaysi 63bb937796 feat: podcast demo mode with ALWRITY_ENABLED_FEATURES support
- Add ALWRITY_ENABLED_FEATURES env var for feature gating
- Podcast-only mode: skip LLM bootstrap, scheduler, persona services
- Enhance video generation prompt with scene context, analysis, narration
- Add voice cloning support via custom_voice_id in WaveSpeed
- Add text-to-speech for research results (browser speechSynthesis)
- Fix render queue to sync images from script phase
- Add WaveSpeed LLM pricing (gpt-oss-120b)
- Fix podcast bible generation error handling
- Refactor RouterManager for feature-based router loading
2026-04-03 06:59:59 +05:30

303 lines
12 KiB
Python

"""
Podcast Research Handlers
Research endpoints using Exa provider and LLM summarization.
"""
from fastapi import APIRouter, Depends, HTTPException
from typing import Dict, Any, List
from types import SimpleNamespace
import json
import re
from middleware.auth_middleware import get_current_user
from api.story_writer.utils.auth import require_authenticated_user
from services.blog_writer.research.exa_provider import ExaResearchProvider
from services.llm_providers.main_text_generation import llm_text_gen
from services.podcast_bible_service import PodcastBibleService
from loguru import logger
from ..models import (
PodcastExaResearchRequest,
PodcastExaResearchResponse,
PodcastExaSource,
PodcastExaConfig,
PodcastResearchInsight,
)
router = APIRouter()
@router.post("/research/exa", response_model=PodcastExaResearchResponse)
async def podcast_research_exa(
request: PodcastExaResearchRequest,
current_user: Dict[str, Any] = Depends(get_current_user),
):
"""
Run podcast research via Exa and then use LLM to extract deep insights.
Uses Podcast Bible and Analysis context for hyper-personalization.
"""
user_id = require_authenticated_user(current_user)
logger.warning(f"[Podcast Research] ========== REQUEST START ==========")
logger.warning(f"[Podcast Research] User: {user_id}, Topic: {request.topic[:80]}...")
logger.warning(f"[Podcast Research] Queries count: {len(request.queries) if request.queries else 0}")
queries = [q.strip() for q in request.queries if q and q.strip()]
if not queries:
raise HTTPException(status_code=400, detail="At least one query is required for research.")
logger.warning(f"[Podcast Research] EXACT queries being sent to Exa: {queries}")
exa_cfg = request.exa_config or PodcastExaConfig()
cfg = SimpleNamespace(
exa_search_type=exa_cfg.exa_search_type or "auto",
exa_category=exa_cfg.exa_category,
exa_include_domains=exa_cfg.exa_include_domains or [],
exa_exclude_domains=exa_cfg.exa_exclude_domains or [],
max_sources=exa_cfg.max_sources or 8,
source_types=[],
)
provider = ExaResearchProvider()
logger.warning(f"[Podcast Research] Provider initialized, starting Exa search...")
# --- Context Building ---
bible_service = PodcastBibleService()
bible_context = ""
if request.bible:
try:
from models.podcast_bible_models import PodcastBible
bible_data = PodcastBible(**request.bible)
bible_context = bible_service.serialize_bible(bible_data)
except Exception as exc:
logger.warning(f"[Podcast Research] Failed to serialize bible: {exc}")
analysis_context = ""
if request.analysis:
analysis_context = f"""
PODCAST ANALYSIS CONTEXT:
========================
Topic: {request.topic}
Target Audience: {request.analysis.get('audience', 'General')}
Content Type: {request.analysis.get('content_type', 'Informative')}
Top Keywords: {', '.join(request.analysis.get('top_keywords', []))}
Episode Hook (Intro): {request.analysis.get('episode_hook', 'N/A')}
Key Takeaways: {', '.join(request.analysis.get('key_takeaways', [])) or 'N/A'}
Guest Talking Points: {', '.join(request.analysis.get('guest_talking_points', [])) or 'N/A'}
Listener CTA: {request.analysis.get('listener_cta', 'N/A')}
"""
# Exa search params
industry = request.bible.get("brand", {}).get("industry", "") if request.bible else ""
target_audience = ""
if request.bible:
audience_dna = request.bible.get("audience", {})
if audience_dna:
interests = ", ".join(audience_dna.get("interests", []))
target_audience = f"Expertise: {audience_dna.get('expertise_level', '')}. Interests: {interests}."
try:
# 1. RUN EXA SEARCH
logger.warning(f"[Podcast Research] Calling Exa search with topic: {request.topic[:100]}...")
result = await provider.search(
prompt=request.topic,
topic=request.topic,
industry=industry,
target_audience=target_audience,
config=cfg,
user_id=user_id,
)
logger.warning(f"[Podcast Research] Exa search completed, got {len(result.get('sources', []))} sources")
except Exception as exc:
logger.error(f"[Podcast Exa Research] Search failed for user {user_id}: {exc}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Exa research failed: {exc}")
# 2. EXTRACT INSIGHTS VIA LLM
raw_content = result.get("content", "")
sources = result.get("sources", [])
summary = ""
key_insights = []
if raw_content and sources:
logger.warning(f"[Podcast Research] Extracting insights from {len(sources)} sources for user {user_id}")
# Build list of research queries used for this search
queries_used = ", ".join([f"Query {i+1}: {q}" for i, q in enumerate(queries)]) if queries else "No specific queries"
prompt = f"""
You are an expert research analyst and content strategist for a high-end podcast production team.
Your task is to analyze the research data and extract deep, podcast-ready insights.
PODCAST CONTEXT:
================
Main Topic: {request.topic}
RESEARCH QUERIES USED:
=====================
{queries_used}
PODCAST BIBLE & BRAND CONTEXT:
==============================
{bible_context}
PODCAST ANALYSIS (from AI Analysis phase):
==========================================
{analysis_context}
RESEARCH DATA (from {len(sources)} sources):
============================================
{raw_content}
YOUR TASK:
==========
As a podcast research expert, analyze this data and create content that will:
1. Engage the specific target audience identified above
2. Support the episode hook and key takeaways already planned
3. Provide talking points that complement the guest's expertise
4. Include a compelling call-to-action for listeners
REQUIRED OUTPUT (JSON):
=======================
{{
"summary": "2-3 paragraph comprehensive summary in Markdown. Start with a hook that matches the episode intro. Include specific data points, expert quotes, and trends.",
"key_insights": [
{{
"title": "Catchy, engaging title for this insight",
"content": "3-4 sentences with specific facts, quotes, or data. Write in a conversational tone suitable for a podcast host to discuss.",
"source_indices": [1, 2, 3],
"podcast_talking_points": ["Point 1 host can expand on", "Counter-point or follow-up", "Question to ask guest"]
}}
],
"expert_quotes": [
{{
"quote": "Direct quote from source",
"source_index": 1,
"context": "Why this quote matters for the podcast"
}}
],
"listener_cta_suggestions": ["Specific action listener can take", "Resource to share", "Next episode preview"]
}}
QUALITY STANDARDS:
==================
- INSIGHTS MUST BE DEEP, not superficial - avoid generic statements
- Include SPECIFIC DATA POINTS, percentages, statistics when available
- Extract EXPERT QUOTES that hosts can reference
- Identify GAPS in the research where more depth is needed
- Make content naturally flow into the planned episode hook and CTA
- Write in a CONVERSATIONAL tone - how a host would actually speak
- Flag any CONTROVERSIAL or debatable claims for host to address
"""
try:
logger.warning(f"[Podcast Research] Calling LLM for insight extraction...")
llm_response = llm_text_gen(
prompt=prompt,
user_id=user_id,
json_struct=None,
preferred_provider=None,
flow_type="premium_tool",
)
logger.warning(f"[Podcast Research] LLM response received, length: {len(llm_response) if llm_response else 0}")
# Normalize response - handle both string and dict responses
data = None
if isinstance(llm_response, str):
try:
# Try to fix common JSON issues
fixed_response = llm_response.strip()
# Remove markdown code blocks if present
if fixed_response.startswith("```"):
fixed_response = fixed_response.split("```")[1]
if fixed_response.startswith("json"):
fixed_response = fixed_response[4:]
fixed_response = fixed_response.strip()
data = json.loads(fixed_response)
except json.JSONDecodeError as json_err:
logger.warning(f"[Podcast Research] Failed to parse JSON: {json_err}. Response preview: {llm_response[:500]}...")
# Try to extract JSON from response using regex
json_match = re.search(r'\{.*\}', llm_response, re.DOTALL)
if json_match:
try:
data = json.loads(json_match.group())
logger.warning("[Podcast Research] Successfully extracted JSON via regex")
except:
pass
else:
data = llm_response
if data:
try:
summary = data.get("summary", "")
key_insights = [PodcastResearchInsight(**insight) for insight in data.get("key_insights", [])]
except Exception as insight_err:
logger.warning(f"[Podcast Research] Failed to parse insights: {insight_err}. Data keys: {list(data.keys()) if isinstance(data, dict) else 'not a dict'}")
summary = data.get("summary", "") if isinstance(data, dict) else ""
key_insights = []
else:
summary = ""
key_insights = []
except HTTPException:
raise
except Exception as exc:
logger.error(f"[Podcast Research] LLM Insight extraction failed: {exc}")
raise HTTPException(status_code=500, detail=f"Research insight extraction failed: {exc}")
# Fallback: if summary is still empty (e.g. LLM returned empty string), use raw content first paragraph or basic text
if not summary:
if raw_content:
summary = raw_content[:2000] # Use first 2000 chars of raw content as summary
else:
summary = f"Research completed for '{request.topic}'. Found {len(sources)} sources."
# 3. TRACK USAGE
try:
cost_total = 0.0
if isinstance(result, dict):
cost_total = result.get("cost", {}).get("total", 0.005) if result.get("cost") else 0.005
provider.track_exa_usage(user_id, cost_total)
except Exception as track_err:
logger.warning(f"[Podcast Exa Research] Failed to track usage: {track_err}")
sources_payload = []
seen_urls = set()
for src in sources:
url = src.get("url", "")
# Skip duplicates
if url and url in seen_urls:
continue
if url:
seen_urls.add(url)
try:
sources_payload.append(PodcastExaSource(**src))
except Exception:
sources_payload.append(PodcastExaSource(**{
"title": src.get("title", ""),
"url": url,
"excerpt": src.get("excerpt") or (src.get("highlights")[0] if src.get("highlights") else "") or src.get("summary", ""),
"published_at": src.get("published_at"),
"publishedDate": src.get("publishedDate"),
"highlights": src.get("highlights"),
"summary": src.get("summary"),
"source_type": src.get("source_type"),
"index": src.get("index"),
"image": src.get("image"),
"author": src.get("author"),
"text": src.get("text"),
"credibility_score": src.get("credibility_score"),
}))
return PodcastExaResearchResponse(
sources=sources_payload,
search_queries=result.get("search_queries", queries) if isinstance(result, dict) else queries,
summary=summary,
key_insights=key_insights,
cost=result.get("cost") if isinstance(result, dict) else None,
search_type=result.get("search_type") if isinstance(result, dict) else None,
provider=result.get("provider", "exa") if isinstance(result, dict) else "exa",
content=raw_content,
)