Harden audio-only script flow and mode propagation

This commit is contained in:
ي
2026-04-20 08:44:46 +05:30
parent ba9ddbf368
commit e68c289901
7 changed files with 148 additions and 11 deletions

View File

@@ -8,6 +8,7 @@ from fastapi import APIRouter, Depends, HTTPException
from typing import Dict, Any, Optional from typing import Dict, Any, Optional
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
import json import json
import re
from middleware.auth_middleware import get_current_user from middleware.auth_middleware import get_current_user
from api.story_writer.utils.auth import require_authenticated_user from api.story_writer.utils.auth import require_authenticated_user
@@ -23,6 +24,8 @@ from ..models import (
) )
router = APIRouter() router = APIRouter()
MAX_TTS_CHARS_PER_REQUEST = 10_000
TARGET_TTS_CHARS_PER_SCENE = 8_500
class SceneApprovalRequest(BaseModel): class SceneApprovalRequest(BaseModel):
@@ -60,28 +63,43 @@ async def generate_podcast_script(
logger.warning(f"[ScriptGen] ========== SCRIPT GENERATION START ==========") logger.warning(f"[ScriptGen] ========== SCRIPT GENERATION START ==========")
logger.warning(f"[ScriptGen] Topic: {request.idea[:60]}...") logger.warning(f"[ScriptGen] Topic: {request.idea[:60]}...")
logger.warning(f"[ScriptGen] Duration: {request.duration_minutes} min, Speakers: {request.speakers}") logger.warning(f"[ScriptGen] Duration: {request.duration_minutes} min, Speakers: {request.speakers}")
logger.warning(f"[ScriptGen] Has research: {bool(request.research)}, Has bible: {bool(request.bible)}, Has analysis: {bool(request.analysis)}") podcast_mode = (request.podcast_mode or "video_only").strip().lower()
logger.warning(f"[ScriptGen] Has research: {bool(request.research)}, Has bible: {bool(request.bible)}, Has analysis: {bool(request.analysis)}, Mode: {podcast_mode}")
research_fact_cards = request.research.get("factCards", []) if request.research else []
# Build comprehensive research context for higher-quality scripts # Build comprehensive research context for higher-quality scripts
research_context = "" research_context = ""
if request.research: if request.research:
try: try:
key_insights = request.research.get("keyword_analysis", {}).get("key_insights") or [] key_insights = request.research.get("keyword_analysis", {}).get("key_insights") or []
fact_cards = request.research.get("factCards", []) or [] fact_cards = research_fact_cards or []
mapped_angles = request.research.get("mappedAngles", []) or [] mapped_angles = request.research.get("mappedAngles", []) or []
sources = request.research.get("sources", []) or [] sources = request.research.get("sources", []) or []
top_facts = [f.get("quote", "") for f in fact_cards[:5] if f.get("quote")] top_facts = [
f"[{f.get('id') or f'fact_{idx + 1}'}] {f.get('quote', '')}"
for idx, f in enumerate(fact_cards[:10])
if f.get("quote")
]
angles_summary = [ angles_summary = [
f"{a.get('title', '')}: {a.get('why', '')}" for a in mapped_angles[:3] if a.get("title") or a.get("why") f"{a.get('title', '')}: {a.get('why', '')}" for a in mapped_angles[:3] if a.get("title") or a.get("why")
] ]
top_sources = [s.get("url") for s in sources[:3] if s.get("url")] top_sources = [s.get("url") for s in sources[:3] if s.get("url")]
numeric_signals = []
for f in fact_cards[:12]:
quote = (f.get("quote") or "").strip()
if any(ch.isdigit() for ch in quote):
numeric_signals.append(quote[:180])
if len(numeric_signals) >= 5:
break
research_parts = [] research_parts = []
if key_insights: if key_insights:
research_parts.append(f"Key Insights: {', '.join(key_insights[:5])}") research_parts.append(f"Key Insights: {', '.join(key_insights[:5])}")
if top_facts: if top_facts:
research_parts.append(f"Key Facts: {', '.join(top_facts)}") research_parts.append(f"Key Facts: {', '.join(top_facts)}")
if numeric_signals:
research_parts.append(f"Numeric Signals (prefer for chart scenes): {' | '.join(numeric_signals)}")
if angles_summary: if angles_summary:
research_parts.append(f"Research Angles: {' | '.join(angles_summary)}") research_parts.append(f"Research Angles: {' | '.join(angles_summary)}")
if top_sources: if top_sources:
@@ -92,6 +110,51 @@ async def generate_podcast_script(
logger.warning(f"Failed to parse research context: {exc}") logger.warning(f"Failed to parse research context: {exc}")
research_context = "" research_context = ""
def _normalize_fact_ids(value: Any) -> Optional[list[str]]:
if not value:
return None
if isinstance(value, list):
cleaned = [str(v).strip() for v in value if str(v).strip()]
return cleaned or None
if isinstance(value, str) and value.strip():
return [value.strip()]
return None
def _default_chart_data(scene_title: str) -> Dict[str, Any]:
numeric_pairs: list[tuple[str, float]] = []
for fact in research_fact_cards[:12]:
quote = (fact.get("quote") or "").strip()
if not quote:
continue
nums = re.findall(r"\d+(?:\.\d+)?", quote.replace(",", ""))
if not nums:
continue
label = quote[:48] + ("" if len(quote) > 48 else "")
try:
numeric_pairs.append((label, float(nums[0])))
except ValueError:
continue
if len(numeric_pairs) >= 5:
break
if numeric_pairs:
labels = [p[0] for p in numeric_pairs]
values = [p[1] for p in numeric_pairs]
return {
"type": "bar_comparison",
"title": scene_title,
"labels": labels,
"values": values,
"takeaway": "Data points sourced from research facts used in this scene.",
}
return {
"type": "bullet",
"title": scene_title,
"bullet_points": ["Key point 1", "Key point 2", "Key point 3"],
"takeaway": "Narration summary for this scene.",
}
# Extract Podcast Bible context for hyper-personalization # Extract Podcast Bible context for hyper-personalization
bible_context = "" bible_context = ""
if request.bible: if request.bible:
@@ -122,25 +185,58 @@ async def generate_podcast_script(
except: except:
pass pass
mode_instructions = ""
if podcast_mode == "audio_only":
mode_instructions = f"""
AUDIO-ONLY MODE RULES (CRITICAL):
- This is an audio-only episode. Do NOT include avatar/image/camera instructions.
- Keep each scene's total dialogue under {TARGET_TTS_CHARS_PER_SCENE} chars to stay below TTS max request size ({MAX_TTS_CHARS_PER_REQUEST}).
- For every scene include chart_data so B-roll charts can be generated while narration plays.
- Build script STRICTLY from RESEARCH context and cite fact linkage via usedFactIds.
- If evidence is weak, say uncertainty explicitly rather than inventing facts.
- Add natural TTS pacing in dialogue with markers like [pause:300ms], [pause:700ms], [emote:curious], [emote:serious].
"""
elif podcast_mode == "audio_video":
mode_instructions = """
AUDIO+VIDEO MODE:
- Include rich narration that works for both listening and visual storytelling.
- Use a balanced pace suitable for TTS and scene visuals.
"""
else:
mode_instructions = """
VIDEO-ONLY MODE:
- Prioritize visual rhythm and concise narration per scene.
"""
prompt = f"""Create a podcast script with scenes and dialogue. prompt = f"""Create a podcast script with scenes and dialogue.
{f"BIBLE: {bible_context[:1500]}" if bible_context else ""} {f"BIBLE: {bible_context[:1500]}" if bible_context else ""}
{f"{analysis_context}" if analysis_context else ""} {f"{analysis_context}" if analysis_context else ""}
{f"{outline_context}" if outline_context else ""} {f"{outline_context}" if outline_context else ""}
{f"RESEARCH: {research_context[:1200]}" if research_context else ""} {f"RESEARCH: {research_context[:2500]}" if research_context else ""}
{mode_instructions}
Topic: "{request.idea}" Topic: "{request.idea}"
Duration: {request.duration_minutes} min | Speakers: {request.speakers} Duration: {request.duration_minutes} min | Speakers: {request.speakers}
Podcast mode: {podcast_mode}
Return JSON with scenes array. Each scene: Return JSON with scenes array. Each scene:
- id: string - id: string
- title: short title (<=50 chars) - title: short title (<=50 chars)
- duration: seconds (total/5) - duration: seconds (total/5)
- emotion: neutral|happy|excited|serious|curious|confident - emotion: neutral|happy|excited|serious|curious|confident
- lines: array of {{speaker, text, emphasis}} - lines: array of {{speaker, text, emphasis, usedFactIds, ttsHints}}
- Use 2-4 LINES PER SCENE (shorter script = lower TTS costs) - Use 2-4 LINES PER SCENE (shorter script = lower TTS costs)
- Each line: 1-3 sentences, conversational - Each line: 1-3 sentences, conversational
- usedFactIds: include related fact ids when research facts are available (example: ["fact_1", "fact_3"])
- ttsHints: optional list from [pause_300ms, pause_700ms, smile, serious_tone, emphasize_data]
- Plain text only, no markdown - Plain text only, no markdown
- chart_data: object for B-roll mapping (required in audio_only)
- type: bar_comparison|line_trend|bullet_points
- title: short chart title
- labels: list
- values: list (same length as labels)
- takeaway: one sentence tying chart to narration
COST OPTIMIZATION: COST OPTIMIZATION:
- 5-6 scenes max for {request.duration_minutes} min episode - 5-6 scenes max for {request.duration_minutes} min episode
@@ -231,7 +327,8 @@ COST OPTIMIZATION:
line_id = line.get("id") or f"line-{idx + 1}-{line_idx + 1}" line_id = line.get("id") or f"line-{idx + 1}-{line_idx + 1}"
# Get used fact IDs if provided # Get used fact IDs if provided
used_fact_ids = line.get("usedFactIds") or line.get("used_fact_ids") or None used_fact_ids = _normalize_fact_ids(line.get("usedFactIds") or line.get("used_fact_ids"))
tts_hints = line.get("ttsHints") or line.get("tts_hints") or None
if text: if text:
lines.append(PodcastSceneLine( lines.append(PodcastSceneLine(
@@ -239,7 +336,8 @@ COST OPTIMIZATION:
text=text, text=text,
emphasis=emphasis, emphasis=emphasis,
id=line_id, id=line_id,
usedFactIds=used_fact_ids usedFactIds=used_fact_ids,
ttsHints=tts_hints if isinstance(tts_hints, list) else None,
)) ))
total_lines_output += 1 total_lines_output += 1
else: else:
@@ -255,6 +353,33 @@ COST OPTIMIZATION:
if audio_url_raw: if audio_url_raw:
logger.warning(f"[ScriptGen] Scene {idx} has audioUrl - will be reset to None") logger.warning(f"[ScriptGen] Scene {idx} has audioUrl - will be reset to None")
# Keep each scene under TTS request size to prevent failures
scene_char_count = sum(len((l.text or "").strip()) for l in lines)
if scene_char_count > TARGET_TTS_CHARS_PER_SCENE and lines:
logger.warning(
f"[ScriptGen] Scene {idx} text too long ({scene_char_count} chars). "
f"Trimming to {TARGET_TTS_CHARS_PER_SCENE} target."
)
trimmed_lines: list[PodcastSceneLine] = []
remaining = TARGET_TTS_CHARS_PER_SCENE
for l in lines:
if remaining <= 0:
break
line_text = (l.text or "").strip()
if len(line_text) <= remaining:
trimmed_lines.append(l)
remaining -= len(line_text)
continue
l.text = f"{line_text[:max(0, remaining - 1)].rstrip()}"
trimmed_lines.append(l)
remaining = 0
lines = trimmed_lines
chart_data = scene.get("chart_data") or scene.get("chartData") or None
if podcast_mode == "audio_only" and not chart_data:
# Ensure audio-only always has a B-roll mapping fallback
chart_data = _default_chart_data(title)
scenes.append( scenes.append(
PodcastScene( PodcastScene(
id=scene.get("id") or f"scene-{idx + 1}", id=scene.get("id") or f"scene-{idx + 1}",
@@ -266,6 +391,7 @@ COST OPTIMIZATION:
imageUrl=None, # Will be generated later imageUrl=None, # Will be generated later
audioUrl=None, # Will be generated later audioUrl=None, # Will be generated later
imagePrompt=None, # Will be generated during image generation imagePrompt=None, # Will be generated during image generation
chart_data=chart_data if isinstance(chart_data, dict) else None,
) )
) )
@@ -275,4 +401,3 @@ COST OPTIMIZATION:
logger.warning(f"[ScriptGen] Dropped {dropped_empty_lines} empty lines") logger.warning(f"[ScriptGen] Dropped {dropped_empty_lines} empty lines")
return PodcastScriptResponse(scenes=scenes) return PodcastScriptResponse(scenes=scenes)

View File

@@ -97,6 +97,7 @@ class PodcastScriptRequest(BaseModel):
bible: Optional[Dict[str, Any]] = Field(None, description="Podcast Bible for hyper-personalization") bible: Optional[Dict[str, Any]] = Field(None, description="Podcast Bible for hyper-personalization")
outline: Optional[Dict[str, Any]] = Field(None, description="The refined episode outline to follow") outline: Optional[Dict[str, Any]] = Field(None, description="The refined episode outline to follow")
analysis: Optional[Dict[str, Any]] = Field(None, description="The full analysis context (audience, keywords, etc.)") analysis: Optional[Dict[str, Any]] = Field(None, description="The full analysis context (audience, keywords, etc.)")
podcast_mode: Optional[str] = Field(default="video_only", description="Podcast mode: audio_only, video_only, or audio_video")
class PodcastSceneLine(BaseModel): class PodcastSceneLine(BaseModel):
@@ -105,6 +106,7 @@ class PodcastSceneLine(BaseModel):
emphasis: Optional[bool] = False emphasis: Optional[bool] = False
id: Optional[str] = None # Optional line ID for frontend tracking id: Optional[str] = None # Optional line ID for frontend tracking
usedFactIds: Optional[List[str]] = None # Facts referenced in this line usedFactIds: Optional[List[str]] = None # Facts referenced in this line
ttsHints: Optional[List[str]] = None # Optional TTS hints, e.g. pause_300ms, smile, emphasize_data
class PodcastScene(BaseModel): class PodcastScene(BaseModel):
@@ -117,6 +119,7 @@ class PodcastScene(BaseModel):
imageUrl: Optional[str] = None # Generated image URL for video generation imageUrl: Optional[str] = None # Generated image URL for video generation
audioUrl: Optional[str] = None # Generated audio URL for this scene audioUrl: Optional[str] = None # Generated audio URL for this scene
imagePrompt: Optional[str] = None # Original image generation prompt for video context imagePrompt: Optional[str] = None # Original image generation prompt for video context
chart_data: Optional[Dict[str, Any]] = None # Optional chart mapping for B-roll scenes
class PodcastExaConfig(BaseModel): class PodcastExaConfig(BaseModel):

View File

@@ -289,6 +289,7 @@ const PodcastDashboard: React.FC = () => {
knobs={knobsState} knobs={knobsState}
speakers={project.speakers} speakers={project.speakers}
durationMinutes={project.duration} durationMinutes={project.duration}
podcastMode={project?.podcastMode || "video_only"}
script={scriptData} script={scriptData}
analysis={analysis} analysis={analysis}
outline={analysis?.suggestedOutlines?.[0]} outline={analysis?.suggestedOutlines?.[0]}

View File

@@ -452,6 +452,7 @@ export const usePodcastWorkflow = ({ projectState, onError }: UsePodcastWorkflow
knobs: projectState.knobs, knobs: projectState.knobs,
speakers: project.speakers, speakers: project.speakers,
durationMinutes: project.duration, durationMinutes: project.duration,
podcastMode: (project as any)?.podcastMode || "video_only",
bible: projectState.bible, bible: projectState.bible,
outline: analysis?.suggestedOutlines?.[0], outline: analysis?.suggestedOutlines?.[0],
analysis: analysis, analysis: analysis,

View File

@@ -25,6 +25,7 @@ interface ScriptEditorProps {
avatarUrl?: string | null; // Base avatar URL for consistent scene image generation avatarUrl?: string | null; // Base avatar URL for consistent scene image generation
analysis?: any; analysis?: any;
outline?: any; outline?: any;
podcastMode?: "audio_only" | "video_only" | "audio_video";
} }
export const ScriptEditor: React.FC<ScriptEditorProps> = ({ export const ScriptEditor: React.FC<ScriptEditorProps> = ({
@@ -43,6 +44,7 @@ export const ScriptEditor: React.FC<ScriptEditorProps> = ({
avatarUrl, avatarUrl,
analysis, analysis,
outline, outline,
podcastMode = "video_only",
}) => { }) => {
const [script, setScript] = useState<Script | null>(initialScript); const [script, setScript] = useState<Script | null>(initialScript);
const [loading, setLoading] = useState(false); const [loading, setLoading] = useState(false);
@@ -93,6 +95,7 @@ export const ScriptEditor: React.FC<ScriptEditorProps> = ({
knobs, knobs,
speakers, speakers,
durationMinutes, durationMinutes,
podcastMode,
analysis, analysis,
outline, outline,
}) })
@@ -112,7 +115,7 @@ export const ScriptEditor: React.FC<ScriptEditorProps> = ({
return () => { return () => {
mounted = false; mounted = false;
}; };
}, [projectId, rawResearch, idea, knobs, speakers, durationMinutes, analysis, outline, emitScriptChange, onError, script]); }, [projectId, rawResearch, idea, knobs, speakers, durationMinutes, podcastMode, analysis, outline, emitScriptChange, onError, script]);
const updateScene = (updated: Scene) => { const updateScene = (updated: Scene) => {
// Use functional update to ensure we're working with latest state // Use functional update to ensure we're working with latest state
@@ -837,4 +840,3 @@ export const ScriptEditor: React.FC<ScriptEditorProps> = ({
</Box> </Box>
); );
}; };

View File

@@ -208,6 +208,7 @@ export const ScriptEditorProvider: React.FC<ScriptEditorProviderProps> = ({
knobs, knobs,
speakers, speakers,
durationMinutes, durationMinutes,
podcastMode,
analysis, analysis,
outline, outline,
}) })
@@ -547,4 +548,4 @@ export const useScriptEditor = (): ScriptEditorContextType => {
throw new Error("useScriptEditor must be used within ScriptEditorProvider"); throw new Error("useScriptEditor must be used within ScriptEditorProvider");
} }
return context; return context;
}; };

View File

@@ -12,6 +12,7 @@ import {
Knobs, Knobs,
PodcastAnalysis, PodcastAnalysis,
PodcastEstimate, PodcastEstimate,
PodcastMode,
Query, Query,
RenderJobResult, RenderJobResult,
Research, Research,
@@ -403,6 +404,7 @@ export const podcastApi = {
knobs: Knobs; knobs: Knobs;
speakers: number; speakers: number;
durationMinutes: number; durationMinutes: number;
podcastMode?: PodcastMode;
bible?: any; bible?: any;
outline?: any; outline?: any;
analysis?: PodcastAnalysis | null; analysis?: PodcastAnalysis | null;
@@ -427,6 +429,7 @@ export const podcastApi = {
bible: params.bible, bible: params.bible,
outline: params.outline, outline: params.outline,
analysis: params.analysis, analysis: params.analysis,
podcast_mode: params.podcastMode || "video_only",
}); });
if (params.onProgress) { if (params.onProgress) {
@@ -453,6 +456,7 @@ export const podcastApi = {
}, },
], ],
approved: false, approved: false,
chart_data: scene.chart_data || scene.chartData || undefined,
})); }));
return { scenes: scriptScenes }; return { scenes: scriptScenes };