From e68c289901bea121126d2244c4baa00356be644b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D9=8A?= Date: Mon, 20 Apr 2026 08:44:46 +0530 Subject: [PATCH] Harden audio-only script flow and mode propagation --- backend/api/podcast/handlers/script.py | 141 +++++++++++++++++- backend/api/podcast/models.py | 3 + .../PodcastMaker/PodcastDashboard.tsx | 1 + .../PodcastDashboard/usePodcastWorkflow.ts | 1 + .../ScriptEditor/ScriptEditor.tsx | 6 +- .../ScriptEditor/ScriptEditorContext.tsx | 3 +- frontend/src/services/podcastApi.ts | 4 + 7 files changed, 148 insertions(+), 11 deletions(-) diff --git a/backend/api/podcast/handlers/script.py b/backend/api/podcast/handlers/script.py index 7bf79152..ee2b274d 100644 --- a/backend/api/podcast/handlers/script.py +++ b/backend/api/podcast/handlers/script.py @@ -8,6 +8,7 @@ from fastapi import APIRouter, Depends, HTTPException from typing import Dict, Any, Optional from pydantic import BaseModel, Field import json +import re from middleware.auth_middleware import get_current_user from api.story_writer.utils.auth import require_authenticated_user @@ -23,6 +24,8 @@ from ..models import ( ) router = APIRouter() +MAX_TTS_CHARS_PER_REQUEST = 10_000 +TARGET_TTS_CHARS_PER_SCENE = 8_500 class SceneApprovalRequest(BaseModel): @@ -60,28 +63,43 @@ async def generate_podcast_script( logger.warning(f"[ScriptGen] ========== SCRIPT GENERATION START ==========") logger.warning(f"[ScriptGen] Topic: {request.idea[:60]}...") logger.warning(f"[ScriptGen] Duration: {request.duration_minutes} min, Speakers: {request.speakers}") - logger.warning(f"[ScriptGen] Has research: {bool(request.research)}, Has bible: {bool(request.bible)}, Has analysis: {bool(request.analysis)}") + podcast_mode = (request.podcast_mode or "video_only").strip().lower() + logger.warning(f"[ScriptGen] Has research: {bool(request.research)}, Has bible: {bool(request.bible)}, Has analysis: {bool(request.analysis)}, Mode: {podcast_mode}") + research_fact_cards = request.research.get("factCards", []) if request.research else [] # Build comprehensive research context for higher-quality scripts research_context = "" if request.research: try: key_insights = request.research.get("keyword_analysis", {}).get("key_insights") or [] - fact_cards = request.research.get("factCards", []) or [] + fact_cards = research_fact_cards or [] mapped_angles = request.research.get("mappedAngles", []) or [] sources = request.research.get("sources", []) or [] - top_facts = [f.get("quote", "") for f in fact_cards[:5] if f.get("quote")] + top_facts = [ + f"[{f.get('id') or f'fact_{idx + 1}'}] {f.get('quote', '')}" + for idx, f in enumerate(fact_cards[:10]) + if f.get("quote") + ] angles_summary = [ f"{a.get('title', '')}: {a.get('why', '')}" for a in mapped_angles[:3] if a.get("title") or a.get("why") ] top_sources = [s.get("url") for s in sources[:3] if s.get("url")] + numeric_signals = [] + for f in fact_cards[:12]: + quote = (f.get("quote") or "").strip() + if any(ch.isdigit() for ch in quote): + numeric_signals.append(quote[:180]) + if len(numeric_signals) >= 5: + break research_parts = [] if key_insights: research_parts.append(f"Key Insights: {', '.join(key_insights[:5])}") if top_facts: research_parts.append(f"Key Facts: {', '.join(top_facts)}") + if numeric_signals: + research_parts.append(f"Numeric Signals (prefer for chart scenes): {' | '.join(numeric_signals)}") if angles_summary: research_parts.append(f"Research Angles: {' | '.join(angles_summary)}") if top_sources: @@ -92,6 +110,51 @@ async def generate_podcast_script( logger.warning(f"Failed to parse research context: {exc}") research_context = "" + def _normalize_fact_ids(value: Any) -> Optional[list[str]]: + if not value: + return None + if isinstance(value, list): + cleaned = [str(v).strip() for v in value if str(v).strip()] + return cleaned or None + if isinstance(value, str) and value.strip(): + return [value.strip()] + return None + + def _default_chart_data(scene_title: str) -> Dict[str, Any]: + numeric_pairs: list[tuple[str, float]] = [] + for fact in research_fact_cards[:12]: + quote = (fact.get("quote") or "").strip() + if not quote: + continue + nums = re.findall(r"\d+(?:\.\d+)?", quote.replace(",", "")) + if not nums: + continue + label = quote[:48] + ("…" if len(quote) > 48 else "") + try: + numeric_pairs.append((label, float(nums[0]))) + except ValueError: + continue + if len(numeric_pairs) >= 5: + break + + if numeric_pairs: + labels = [p[0] for p in numeric_pairs] + values = [p[1] for p in numeric_pairs] + return { + "type": "bar_comparison", + "title": scene_title, + "labels": labels, + "values": values, + "takeaway": "Data points sourced from research facts used in this scene.", + } + + return { + "type": "bullet", + "title": scene_title, + "bullet_points": ["Key point 1", "Key point 2", "Key point 3"], + "takeaway": "Narration summary for this scene.", + } + # Extract Podcast Bible context for hyper-personalization bible_context = "" if request.bible: @@ -122,25 +185,58 @@ async def generate_podcast_script( except: pass + mode_instructions = "" + if podcast_mode == "audio_only": + mode_instructions = f""" +AUDIO-ONLY MODE RULES (CRITICAL): +- This is an audio-only episode. Do NOT include avatar/image/camera instructions. +- Keep each scene's total dialogue under {TARGET_TTS_CHARS_PER_SCENE} chars to stay below TTS max request size ({MAX_TTS_CHARS_PER_REQUEST}). +- For every scene include chart_data so B-roll charts can be generated while narration plays. +- Build script STRICTLY from RESEARCH context and cite fact linkage via usedFactIds. +- If evidence is weak, say uncertainty explicitly rather than inventing facts. +- Add natural TTS pacing in dialogue with markers like [pause:300ms], [pause:700ms], [emote:curious], [emote:serious]. +""" + elif podcast_mode == "audio_video": + mode_instructions = """ +AUDIO+VIDEO MODE: +- Include rich narration that works for both listening and visual storytelling. +- Use a balanced pace suitable for TTS and scene visuals. +""" + else: + mode_instructions = """ +VIDEO-ONLY MODE: +- Prioritize visual rhythm and concise narration per scene. +""" + prompt = f"""Create a podcast script with scenes and dialogue. {f"BIBLE: {bible_context[:1500]}" if bible_context else ""} {f"{analysis_context}" if analysis_context else ""} {f"{outline_context}" if outline_context else ""} -{f"RESEARCH: {research_context[:1200]}" if research_context else ""} +{f"RESEARCH: {research_context[:2500]}" if research_context else ""} +{mode_instructions} Topic: "{request.idea}" Duration: {request.duration_minutes} min | Speakers: {request.speakers} +Podcast mode: {podcast_mode} Return JSON with scenes array. Each scene: - id: string - title: short title (<=50 chars) - duration: seconds (total/5) - emotion: neutral|happy|excited|serious|curious|confident -- lines: array of {{speaker, text, emphasis}} +- lines: array of {{speaker, text, emphasis, usedFactIds, ttsHints}} - Use 2-4 LINES PER SCENE (shorter script = lower TTS costs) - Each line: 1-3 sentences, conversational + - usedFactIds: include related fact ids when research facts are available (example: ["fact_1", "fact_3"]) + - ttsHints: optional list from [pause_300ms, pause_700ms, smile, serious_tone, emphasize_data] - Plain text only, no markdown +- chart_data: object for B-roll mapping (required in audio_only) + - type: bar_comparison|line_trend|bullet_points + - title: short chart title + - labels: list + - values: list (same length as labels) + - takeaway: one sentence tying chart to narration COST OPTIMIZATION: - 5-6 scenes max for {request.duration_minutes} min episode @@ -231,7 +327,8 @@ COST OPTIMIZATION: line_id = line.get("id") or f"line-{idx + 1}-{line_idx + 1}" # Get used fact IDs if provided - used_fact_ids = line.get("usedFactIds") or line.get("used_fact_ids") or None + used_fact_ids = _normalize_fact_ids(line.get("usedFactIds") or line.get("used_fact_ids")) + tts_hints = line.get("ttsHints") or line.get("tts_hints") or None if text: lines.append(PodcastSceneLine( @@ -239,7 +336,8 @@ COST OPTIMIZATION: text=text, emphasis=emphasis, id=line_id, - usedFactIds=used_fact_ids + usedFactIds=used_fact_ids, + ttsHints=tts_hints if isinstance(tts_hints, list) else None, )) total_lines_output += 1 else: @@ -255,6 +353,33 @@ COST OPTIMIZATION: if audio_url_raw: logger.warning(f"[ScriptGen] Scene {idx} has audioUrl - will be reset to None") + # Keep each scene under TTS request size to prevent failures + scene_char_count = sum(len((l.text or "").strip()) for l in lines) + if scene_char_count > TARGET_TTS_CHARS_PER_SCENE and lines: + logger.warning( + f"[ScriptGen] Scene {idx} text too long ({scene_char_count} chars). " + f"Trimming to {TARGET_TTS_CHARS_PER_SCENE} target." + ) + trimmed_lines: list[PodcastSceneLine] = [] + remaining = TARGET_TTS_CHARS_PER_SCENE + for l in lines: + if remaining <= 0: + break + line_text = (l.text or "").strip() + if len(line_text) <= remaining: + trimmed_lines.append(l) + remaining -= len(line_text) + continue + l.text = f"{line_text[:max(0, remaining - 1)].rstrip()}…" + trimmed_lines.append(l) + remaining = 0 + lines = trimmed_lines + + chart_data = scene.get("chart_data") or scene.get("chartData") or None + if podcast_mode == "audio_only" and not chart_data: + # Ensure audio-only always has a B-roll mapping fallback + chart_data = _default_chart_data(title) + scenes.append( PodcastScene( id=scene.get("id") or f"scene-{idx + 1}", @@ -266,6 +391,7 @@ COST OPTIMIZATION: imageUrl=None, # Will be generated later audioUrl=None, # Will be generated later imagePrompt=None, # Will be generated during image generation + chart_data=chart_data if isinstance(chart_data, dict) else None, ) ) @@ -275,4 +401,3 @@ COST OPTIMIZATION: logger.warning(f"[ScriptGen] Dropped {dropped_empty_lines} empty lines") return PodcastScriptResponse(scenes=scenes) - diff --git a/backend/api/podcast/models.py b/backend/api/podcast/models.py index 2fb7349d..7704e186 100644 --- a/backend/api/podcast/models.py +++ b/backend/api/podcast/models.py @@ -97,6 +97,7 @@ class PodcastScriptRequest(BaseModel): bible: Optional[Dict[str, Any]] = Field(None, description="Podcast Bible for hyper-personalization") outline: Optional[Dict[str, Any]] = Field(None, description="The refined episode outline to follow") analysis: Optional[Dict[str, Any]] = Field(None, description="The full analysis context (audience, keywords, etc.)") + podcast_mode: Optional[str] = Field(default="video_only", description="Podcast mode: audio_only, video_only, or audio_video") class PodcastSceneLine(BaseModel): @@ -105,6 +106,7 @@ class PodcastSceneLine(BaseModel): emphasis: Optional[bool] = False id: Optional[str] = None # Optional line ID for frontend tracking usedFactIds: Optional[List[str]] = None # Facts referenced in this line + ttsHints: Optional[List[str]] = None # Optional TTS hints, e.g. pause_300ms, smile, emphasize_data class PodcastScene(BaseModel): @@ -117,6 +119,7 @@ class PodcastScene(BaseModel): imageUrl: Optional[str] = None # Generated image URL for video generation audioUrl: Optional[str] = None # Generated audio URL for this scene imagePrompt: Optional[str] = None # Original image generation prompt for video context + chart_data: Optional[Dict[str, Any]] = None # Optional chart mapping for B-roll scenes class PodcastExaConfig(BaseModel): diff --git a/frontend/src/components/PodcastMaker/PodcastDashboard.tsx b/frontend/src/components/PodcastMaker/PodcastDashboard.tsx index b8962e19..b14c3f22 100644 --- a/frontend/src/components/PodcastMaker/PodcastDashboard.tsx +++ b/frontend/src/components/PodcastMaker/PodcastDashboard.tsx @@ -289,6 +289,7 @@ const PodcastDashboard: React.FC = () => { knobs={knobsState} speakers={project.speakers} durationMinutes={project.duration} + podcastMode={project?.podcastMode || "video_only"} script={scriptData} analysis={analysis} outline={analysis?.suggestedOutlines?.[0]} diff --git a/frontend/src/components/PodcastMaker/PodcastDashboard/usePodcastWorkflow.ts b/frontend/src/components/PodcastMaker/PodcastDashboard/usePodcastWorkflow.ts index 0a56aa4d..5491e0cf 100644 --- a/frontend/src/components/PodcastMaker/PodcastDashboard/usePodcastWorkflow.ts +++ b/frontend/src/components/PodcastMaker/PodcastDashboard/usePodcastWorkflow.ts @@ -452,6 +452,7 @@ export const usePodcastWorkflow = ({ projectState, onError }: UsePodcastWorkflow knobs: projectState.knobs, speakers: project.speakers, durationMinutes: project.duration, + podcastMode: (project as any)?.podcastMode || "video_only", bible: projectState.bible, outline: analysis?.suggestedOutlines?.[0], analysis: analysis, diff --git a/frontend/src/components/PodcastMaker/ScriptEditor/ScriptEditor.tsx b/frontend/src/components/PodcastMaker/ScriptEditor/ScriptEditor.tsx index c7cd0a80..70dab594 100644 --- a/frontend/src/components/PodcastMaker/ScriptEditor/ScriptEditor.tsx +++ b/frontend/src/components/PodcastMaker/ScriptEditor/ScriptEditor.tsx @@ -25,6 +25,7 @@ interface ScriptEditorProps { avatarUrl?: string | null; // Base avatar URL for consistent scene image generation analysis?: any; outline?: any; + podcastMode?: "audio_only" | "video_only" | "audio_video"; } export const ScriptEditor: React.FC = ({ @@ -43,6 +44,7 @@ export const ScriptEditor: React.FC = ({ avatarUrl, analysis, outline, + podcastMode = "video_only", }) => { const [script, setScript] = useState