Files
ALwrity/backend/api/podcast/handlers/script.py
ajaysi 91b2f996fd feat: voice clone audio generation + podcast workspace architecture
- Voice clone integration: When user selects voice clone in Write phase,
  backend uses their uploaded voice sample + scene script text to generate
  audio via qwen3/minimax/cosyvoice voice clone APIs
- Multi-tenant workspace storage: All podcast assets (audio, video, images,
  charts) now use workspace-specific directories per user
- Chart preview improvements: Card-based B-Roll charts UI with thumbnails,
  takeaway text, and action buttons; public endpoint for image serving
- Voice clone caching: In-memory LRU cache for voice samples (avoids
  re-downloading per scene); frontend caches voice clone metadata
- Thread pool for voice clone: Audio generation uses ThreadPoolExecutor to
  avoid blocking the FastAPI event loop
- Auto-detect voice clone IDs (vc_*, MY_VOICE_CLONE) to route correctly
- DB fallback for voice sample URL: Fetches from ContentAsset if not passed
- Fixed API URL resolution for chart previews
- Fixed GlassyCard DOM warnings for motion props
- Fixed ScriptGenerationProgressView syntax error
- Fixed usePodcastWorkflow scriptData reference
2026-04-21 19:38:50 +05:30

408 lines
17 KiB
Python

"""
Podcast Script Handlers
Script generation and approval endpoints.
"""
from fastapi import APIRouter, Depends, HTTPException
from typing import Dict, Any, Optional
from pydantic import BaseModel, Field
import json
import re
import time
from middleware.auth_middleware import get_current_user
from api.story_writer.utils.auth import require_authenticated_user
from services.llm_providers.main_text_generation import llm_text_gen
from services.podcast_bible_service import PodcastBibleService
from models.podcast_bible_models import PodcastBible
from loguru import logger
from ..models import (
PodcastScriptRequest,
PodcastScriptResponse,
PodcastScene,
PodcastSceneLine,
)
# Router on which the podcast script endpoints defined in this module are registered.
router = APIRouter()
# TTS request-size ceiling (in characters) quoted to the LLM in the audio-only prompt rules.
MAX_TTS_CHARS_PER_REQUEST = 10_000
# Per-scene dialogue budget (in characters); scenes whose lines exceed it are
# trimmed during script normalization so they stay below the request ceiling.
TARGET_TTS_CHARS_PER_SCENE = 8_500
class SceneApprovalRequest(BaseModel):
    """Request body describing an approval decision for a single podcast scene."""

    # Project the scene belongs to; must be a non-empty string.
    project_id: str = Field(..., min_length=1)
    # Scene the decision applies to; must be a non-empty string.
    scene_id: str = Field(..., min_length=1)
    # Approval flag; treated as approved when the client omits it.
    approved: bool = Field(default=True)
    # Optional free-form reviewer notes.
    notes: Optional[str] = Field(default=None)
@router.post("/script/approve")
async def approve_podcast_scene(
    request: SceneApprovalRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> Dict[str, Any]:
    """Acknowledge a scene approval decision and log it for auditing.

    The decision is written to the application log and echoed back to the
    caller; this handler itself does not write to any other store, and
    ``request.notes`` is accepted but not used here.

    Args:
        request: Project id, scene id and approval flag sent by the client.
        current_user: Auth payload injected by ``get_current_user``.

    Returns:
        A dict with ``success`` plus the echoed ``project_id``, ``scene_id``
        and ``approved`` values.
    """
    user_id = require_authenticated_user(current_user)

    project_id = request.project_id
    scene_id = request.scene_id
    approved = request.approved

    logger.warning(
        f"[Podcast] Scene approval recorded user={user_id} project={project_id} "
        f"scene={scene_id} approved={approved}"
    )

    response: Dict[str, Any] = {"success": True}
    response["project_id"] = project_id
    response["scene_id"] = scene_id
    response["approved"] = approved
    return response
# Marker appended to text that had to be cut short (chart labels, over-long lines).
_TRUNCATION_MARK = "\u2026"

# Emotions the scene model is prompted with; anything else falls back to "neutral".
_VALID_EMOTIONS = frozenset({"neutral", "happy", "excited", "serious", "curious", "confident"})


def _normalize_fact_ids(value: Any) -> Optional[list[str]]:
    """Coerce an LLM-supplied fact-id field into a clean list of strings.

    A list is stringified item by item with blanks dropped; a single non-blank
    string becomes a one-element list. Anything else (or an empty result)
    yields ``None``.
    """
    if not value:
        return None
    if isinstance(value, list):
        cleaned = [text for text in (str(item).strip() for item in value) if text]
        return cleaned or None
    if isinstance(value, str) and value.strip():
        return [value.strip()]
    return None


def _default_chart_data(scene_title: str, fact_cards: list) -> Dict[str, Any]:
    """Build fallback B-roll chart data for a scene that came back without any.

    The first number found in each of up to 12 research fact quotes becomes a
    bar value (at most 5 bars). When no fact contains a number, a generic
    bullet chart is returned instead.

    NOTE(review): the generic fallback uses type "bullet" while the prompt
    advertises "bullet_points"; confirm which value the chart renderer expects.
    """
    numeric_pairs: list[tuple[str, float]] = []
    for fact in fact_cards[:12]:
        if not isinstance(fact, dict):
            continue
        quote = str(fact.get("quote") or "").strip()
        if not quote:
            continue
        # Drop thousands separators so "1,200" is read as 1200 rather than 1.
        nums = re.findall(r"\d+(?:\.\d+)?", quote.replace(",", ""))
        if not nums:
            continue
        label = quote[:48] + (_TRUNCATION_MARK if len(quote) > 48 else "")
        try:
            numeric_pairs.append((label, float(nums[0])))
        except ValueError:
            continue
        if len(numeric_pairs) >= 5:
            break

    if numeric_pairs:
        return {
            "type": "bar_comparison",
            "title": scene_title,
            "labels": [pair[0] for pair in numeric_pairs],
            "values": [pair[1] for pair in numeric_pairs],
            "takeaway": "Data points sourced from research facts used in this scene.",
        }
    return {
        "type": "bullet",
        "title": scene_title,
        "bullet_points": ["Key point 1", "Key point 2", "Key point 3"],
        "takeaway": "Narration summary for this scene.",
    }


def _build_research_context(research: Dict[str, Any], fact_cards: list) -> str:
    """Summarise research insights, facts, angles and sources for the prompt.

    Best effort: any malformed research payload is logged and results in an
    empty context rather than failing script generation.
    """
    try:
        key_insights = (research.get("keyword_analysis") or {}).get("key_insights") or []
        mapped_angles = research.get("mappedAngles") or []
        sources = research.get("sources") or []

        top_facts = []
        for idx, fact in enumerate(fact_cards[:10]):
            quote = fact.get("quote")
            if quote:
                fact_id = fact.get("id") or f"fact_{idx + 1}"
                top_facts.append(f"[{fact_id}] {quote}")

        angles_summary = [
            f"{angle.get('title', '')}: {angle.get('why', '')}"
            for angle in mapped_angles[:3]
            if angle.get("title") or angle.get("why")
        ]
        top_sources = [src.get("url") for src in sources[:3] if src.get("url")]

        # Quotes containing digits are surfaced separately so the model can
        # prefer them for chart scenes.
        numeric_signals = []
        for fact in fact_cards[:12]:
            quote = (fact.get("quote") or "").strip()
            if any(ch.isdigit() for ch in quote):
                numeric_signals.append(quote[:180])
            if len(numeric_signals) >= 5:
                break

        parts = []
        if key_insights:
            parts.append(f"Key Insights: {', '.join(key_insights[:5])}")
        if top_facts:
            parts.append(f"Key Facts: {', '.join(top_facts)}")
        if numeric_signals:
            parts.append(f"Numeric Signals (prefer for chart scenes): {' | '.join(numeric_signals)}")
        if angles_summary:
            parts.append(f"Research Angles: {' | '.join(angles_summary)}")
        if top_sources:
            parts.append(f"Top Sources: {', '.join(top_sources)}")
        return "\n".join(parts)
    except Exception as exc:
        logger.warning(f"Failed to parse research context: {exc}")
        return ""


def _build_mode_instructions(podcast_mode: str) -> str:
    """Return the prompt section with rules specific to the podcast mode."""
    if podcast_mode == "audio_only":
        return f"""
AUDIO-ONLY MODE RULES (CRITICAL):
- This is an audio-only episode. Do NOT include avatar/image/camera instructions.
- Keep each scene's total dialogue under {TARGET_TTS_CHARS_PER_SCENE} chars to stay below TTS max request size ({MAX_TTS_CHARS_PER_REQUEST}).
- For every scene include chart_data so B-roll charts can be generated while narration plays.
- Build script STRICTLY from RESEARCH context and cite fact linkage via usedFactIds.
- If evidence is weak, say uncertainty explicitly rather than inventing facts.
- Add natural TTS pacing in dialogue with markers like [pause:300ms], [pause:700ms], [emote:curious], [emote:serious].
"""
    if podcast_mode == "audio_video":
        return """
AUDIO+VIDEO MODE:
- Include rich narration that works for both listening and visual storytelling.
- Use a balanced pace suitable for TTS and scene visuals.
"""
    return """
VIDEO-ONLY MODE:
- Prioritize visual rhythm and concise narration per scene.
"""


def _parse_llm_payload(raw: Any) -> Dict[str, Any]:
    """Turn the raw LLM result into a dict, raising ``HTTPException`` (500) otherwise.

    Accepts an already-parsed dict or a JSON string. A string wrapped in a
    Markdown code fence is unwrapped first, since models frequently fence
    JSON when no structured-output schema is supplied.
    """
    if isinstance(raw, dict):
        return raw
    if not isinstance(raw, str):
        raise HTTPException(status_code=500, detail="Unexpected LLM response format")

    text = raw.strip()
    fenced = re.fullmatch(r"```(?:json)?\s*(.*?)\s*```", text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1)

    try:
        data = json.loads(text)
    except json.JSONDecodeError as exc:
        raise HTTPException(status_code=500, detail="LLM returned non-JSON output") from exc

    # Valid JSON that is not an object (e.g. a bare list) has no "scenes" key.
    if not isinstance(data, dict):
        raise HTTPException(status_code=500, detail="LLM response missing scenes array")
    return data


def _coerce_emphasis(raw: Any) -> bool:
    """Interpret the LLM's ``emphasis`` value (bool, string or other) as a bool."""
    if isinstance(raw, bool):
        return raw
    if isinstance(raw, str):
        lowered = raw.lower()
        emphasis = lowered in ("true", "yes", "1")
        if lowered not in ("true", "false", "yes", "no", "1", "0"):
            logger.debug(f"[ScriptGen] Unusual emphasis value '{raw}' converted to {emphasis}")
        return emphasis
    return bool(raw)


def _normalize_line(
    line: Dict[str, Any],
    scene_idx: int,
    line_idx: int,
    default_speaker: str,
) -> Optional[PodcastSceneLine]:
    """Convert one raw LLM line dict into a ``PodcastSceneLine``.

    Returns ``None`` when the line carries no speakable text so the caller can
    count it as dropped.
    """
    raw_text = line.get("text") or ""
    text = raw_text if isinstance(raw_text, str) else str(raw_text)
    emphasis = _coerce_emphasis(line.get("emphasis", False))
    if not text.strip():
        return None

    tts_hints = line.get("ttsHints") or line.get("tts_hints")
    return PodcastSceneLine(
        # Ids and speaker are stringified because the model may return numbers.
        speaker=str(line.get("speaker") or default_speaker),
        text=text,
        emphasis=emphasis,
        id=str(line.get("id") or f"line-{scene_idx + 1}-{line_idx + 1}"),
        usedFactIds=_normalize_fact_ids(line.get("usedFactIds") or line.get("used_fact_ids")),
        ttsHints=tts_hints if isinstance(tts_hints, list) else None,
    )


def _trim_lines_to_budget(lines: list, budget: int) -> list:
    """Keep lines in order until their combined stripped length reaches ``budget``.

    The first line that does not fit is cut to the remaining space (one
    character is reserved for the truncation marker) and everything after it
    is discarded. A line that would be reduced to nothing is dropped.
    """
    kept = []
    remaining = budget
    for line in lines:
        if remaining <= 0:
            break
        line_text = (line.text or "").strip()
        if len(line_text) <= remaining:
            kept.append(line)
            remaining -= len(line_text)
            continue
        truncated = line_text[:max(0, remaining - 1)].rstrip()
        if truncated:
            line.text = f"{truncated}{_TRUNCATION_MARK}"
            kept.append(line)
        remaining = 0
    return kept


@router.post("/script", response_model=PodcastScriptResponse)
async def generate_podcast_script(
    request: PodcastScriptRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
):
    """
    Generate a podcast script outline (scenes + lines) using podcast-oriented prompting.

    The prompt is assembled from the optional bible, analysis, outline and
    research payloads plus mode-specific rules, sent to the LLM, and the JSON
    reply is normalized into ``PodcastScene`` objects: invalid emotions fall
    back to "neutral", empty lines are dropped, over-long scenes are trimmed to
    ``TARGET_TTS_CHARS_PER_SCENE`` and, in audio_only mode, missing chart data
    is replaced by a fallback chart. Media URLs returned by the model are
    always reset to ``None``.

    Raises:
        HTTPException: 500 when the LLM call fails or its reply is not JSON,
            lacks a scenes array, or contains no usable scenes.
    """
    user_id = require_authenticated_user(current_user)
    start_time = time.time()
    logger.warning("[ScriptGen] ===== SCRIPT_GEN_START =====")
    logger.warning(f"[ScriptGen] user={user_id}, topic='{request.idea[:50]}...', duration={request.duration_minutes}min, speakers={request.speakers}")
    podcast_mode = (request.podcast_mode or "video_only").strip().lower()
    logger.warning(f"[ScriptGen] research={bool(request.research)}, bible={bool(request.bible)}, analysis={bool(request.analysis)}, mode={podcast_mode}")

    # "factCards" may be present but null, so default with `or` rather than .get's default.
    research = request.research or {}
    research_fact_cards = research.get("factCards") or []
    if not isinstance(research_fact_cards, list):
        research_fact_cards = []

    # Build comprehensive research context for higher-quality scripts
    research_context = _build_research_context(research, research_fact_cards) if request.research else ""

    # Extract Podcast Bible context for hyper-personalization
    bible_context = ""
    if request.bible:
        try:
            bible_service = PodcastBibleService()
            bible_obj = PodcastBible(**request.bible)
            bible_context = bible_service.serialize_bible(bible_obj)
        except Exception as exc:
            logger.warning(f"Failed to serialize podcast bible: {exc}")

    # Extract Analysis and Outline context for grounding (best effort, but
    # failures are logged instead of being swallowed silently).
    analysis_context = ""
    if request.analysis:
        try:
            audience = request.analysis.get('audience', '') or ''
            content_type = request.analysis.get('contentType', '') or ''
            keywords = request.analysis.get('topKeywords', []) or []
            analysis_context = f"ANALYSIS: Audience={audience} | Type={content_type} | Keywords={', '.join(keywords[:8])}"
        except Exception as exc:
            logger.warning(f"Failed to parse analysis context: {exc}")

    outline_context = ""
    if request.outline:
        try:
            outline_title = request.outline.get('title', '') or ''
            segments = request.outline.get('segments', []) or []
            outline_context = f"OUTLINE: {outline_title} - {' | '.join(segments[:5])}"
        except Exception as exc:
            logger.warning(f"Failed to parse outline context: {exc}")

    mode_instructions = _build_mode_instructions(podcast_mode)
    # Context sections are capped so the prompt stays bounded.
    bible_section = f"BIBLE: {bible_context[:1500]}" if bible_context else ""
    research_section = f"RESEARCH: {research_context[:2500]}" if research_context else ""

    prompt = f"""Create a podcast script with scenes and dialogue.
{bible_section}
{analysis_context}
{outline_context}
{research_section}
{mode_instructions}
Topic: "{request.idea}"
Duration: {request.duration_minutes} min | Speakers: {request.speakers}
Podcast mode: {podcast_mode}
Return JSON with scenes array. Each scene:
- id: string
- title: short title (<=50 chars)
- duration: seconds (total/5)
- emotion: neutral|happy|excited|serious|curious|confident
- lines: array of {{speaker, text, emphasis, usedFactIds, ttsHints}}
- Use 2-4 LINES PER SCENE (shorter script = lower TTS costs)
- Each line: 1-3 sentences, conversational
- usedFactIds: include related fact ids when research facts are available (example: ["fact_1", "fact_3"])
- ttsHints: optional list from [pause_300ms, pause_700ms, smile, serious_tone, emphasize_data]
- Plain text only, no markdown
- chart_data: object for B-roll mapping (required in audio_only)
- type: bar_comparison|line_trend|bullet_points
- title: short chart title
- labels: list
- values: list (same length as labels)
- takeaway: one sentence tying chart to narration
COST OPTIMIZATION:
- 5-6 scenes max for {request.duration_minutes} min episode
- Concise, information-dense dialogue
- Skip filler words and redundant phrases
- Focus on unique insights from research
- Make every line count toward value delivery
"""

    # NOTE(review): llm_text_gen is invoked synchronously inside this async
    # handler; if it performs blocking network I/O it will stall the event
    # loop for the duration of the call - confirm and consider a thread pool.
    try:
        logger.warning(f"[ScriptGen] Calling LLM to generate script (prompt length: {len(prompt)})...")
        raw = llm_text_gen(
            prompt=prompt,
            user_id=user_id,
            json_struct=None,
            preferred_provider=None,
            flow_type="premium_tool",
        )
        logger.warning(f"[ScriptGen] LLM response received, length: {len(raw) if raw else 0}")
    except HTTPException:
        raise
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Script generation failed: {exc}") from exc

    data = _parse_llm_payload(raw)

    scenes_data = data.get("scenes") or []
    if not isinstance(scenes_data, list):
        raise HTTPException(status_code=500, detail="LLM response missing scenes array")
    if not scenes_data:
        logger.warning("[ScriptGen] LLM returned empty scenes array")
        raise HTTPException(status_code=500, detail="LLM returned no scenes - please try again")
    logger.warning(f"[ScriptGen] Processing {len(scenes_data)} scenes from LLM response")

    # Even split of the episode across scenes, with a 30 second floor.
    default_duration = int(max(30, (request.duration_minutes * 60) // max(1, len(scenes_data))))
    # Guard the Host/Guest alternation below against a zero speaker count.
    speaker_count = max(1, request.speakers or 1)

    # Normalize scenes
    scenes: list[PodcastScene] = []
    total_lines_input = 0
    total_lines_output = 0
    dropped_empty_lines = 0
    for idx, scene in enumerate(scenes_data):
        if not isinstance(scene, dict):
            logger.warning(f"[ScriptGen] Scene {idx} is not a dict, skipping")
            continue

        title = str(scene.get("title") or f"Scene {idx + 1}")

        # The model occasionally returns durations such as "60s"; fall back
        # to the even split instead of failing the whole request.
        try:
            duration = int(float(scene.get("duration") or default_duration))
        except (TypeError, ValueError, OverflowError):
            logger.warning(f"[ScriptGen] Invalid duration '{scene.get('duration')}' in scene {idx}, using {default_duration}")
            duration = default_duration

        raw_emotion = scene.get("emotion") or "neutral"
        emotion = raw_emotion.strip().lower() if isinstance(raw_emotion, str) else ""
        if emotion not in _VALID_EMOTIONS:
            logger.warning(f"[ScriptGen] Invalid emotion '{raw_emotion}' in scene {idx}, defaulting to 'neutral'")
            emotion = "neutral"

        lines_raw = scene.get("lines") or []
        if not isinstance(lines_raw, list):
            logger.warning(f"[ScriptGen] Scene {idx} lines is not a list, ignoring")
            lines_raw = []
        total_lines_input += len(lines_raw)

        lines: list[PodcastSceneLine] = []
        for line_idx, line in enumerate(lines_raw):
            if not isinstance(line, dict):
                logger.warning(f"[ScriptGen] Line {line_idx} in scene {idx} is not a dict, skipping")
                continue
            # Alternate default speakers based on how many lines were kept so far.
            default_speaker = "Host" if len(lines) % speaker_count == 0 else "Guest"
            normalized = _normalize_line(line, idx, line_idx, default_speaker)
            if normalized is None:
                dropped_empty_lines += 1
                logger.debug(f"[ScriptGen] Dropped empty line {line_idx} in scene {idx}")
                continue
            lines.append(normalized)
            total_lines_output += 1

        # Media URLs from the model are never trusted; log when they are discarded.
        if scene.get("imageUrl") or scene.get("image_url"):
            logger.warning(f"[ScriptGen] Scene {idx} has imageUrl - will be reset to None")
        if scene.get("audioUrl") or scene.get("audio_url"):
            logger.warning(f"[ScriptGen] Scene {idx} has audioUrl - will be reset to None")

        # Keep each scene under TTS request size to prevent failures
        scene_char_count = sum(len((entry.text or "").strip()) for entry in lines)
        if scene_char_count > TARGET_TTS_CHARS_PER_SCENE and lines:
            logger.warning(
                f"[ScriptGen] Scene {idx} text too long ({scene_char_count} chars). "
                f"Trimming to {TARGET_TTS_CHARS_PER_SCENE} target."
            )
            lines = _trim_lines_to_budget(lines, TARGET_TTS_CHARS_PER_SCENE)

        chart_data = scene.get("chart_data") or scene.get("chartData") or None
        if podcast_mode == "audio_only" and not chart_data:
            # Ensure audio-only always has a B-roll mapping fallback
            chart_data = _default_chart_data(title, research_fact_cards)

        scenes.append(
            PodcastScene(
                id=str(scene.get("id") or f"scene-{idx + 1}"),
                title=title,
                duration=duration,
                lines=lines,
                approved=False,
                emotion=emotion,
                imageUrl=None,  # Will be generated later
                audioUrl=None,  # Will be generated later
                imagePrompt=None,  # Will be generated during image generation
                chart_data=chart_data if isinstance(chart_data, dict) else None,
            )
        )

    # Every entry was malformed: treat it like an empty reply rather than
    # returning a script with no scenes.
    if not scenes:
        logger.warning("[ScriptGen] LLM returned no usable scenes")
        raise HTTPException(status_code=500, detail="LLM returned no scenes - please try again")

    # Summary logging
    logger.warning(f"[ScriptGen] Script generated: {len(scenes)} scenes, {total_lines_output}/{total_lines_input} lines")
    if dropped_empty_lines > 0:
        logger.warning(f"[ScriptGen] Dropped {dropped_empty_lines} empty lines")
    duration_ms = int((time.time() - start_time) * 1000)
    logger.warning(f"[ScriptGen] ===== SCRIPT_GEN_END (took {duration_ms}ms) =====")
    return PodcastScriptResponse(scenes=scenes)