WIP: AI Podcast Maker and YouTube Creator Studio integration

This commit is contained in:
ajaysi
2025-12-10 09:37:55 +05:30
parent 31f078c763
commit 81590cf4db
75 changed files with 11879 additions and 1380 deletions

View File

@@ -0,0 +1,2 @@
"""YouTube Creator Studio services."""

View File

@@ -0,0 +1,358 @@
"""
YouTube Video Planner Service
Generates video plans, outlines, and insights using AI with persona integration.
"""
from typing import Dict, Any, Optional, List
from loguru import logger
from fastapi import HTTPException
from services.llm_providers.main_text_generation import llm_text_gen
from utils.logger_utils import get_service_logger
# Rebinds the module-level name `logger`: the loguru `logger` imported above is
# replaced by whatever get_service_logger returns for "youtube.planner".
logger = get_service_logger("youtube.planner")
class YouTubePlannerService:
    """Service for planning YouTube videos with AI assistance.

    Builds a prompt from the user's idea, optional persona, source content and
    reference image, then asks the text-generation provider (``llm_text_gen``)
    for a structured JSON plan.
    """

    def __init__(self):
        """Initialize the planner service."""
        logger.info("[YouTubePlanner] Service initialized")

    @staticmethod
    def _build_plan_schema(include_scenes: bool) -> Dict[str, Any]:
        """Return the JSON schema passed to the LLM for a video plan.

        Args:
            include_scenes: When True, a required ``scenes`` array is added so
                the plan and its scene breakdown come back in one response.
        """
        properties: Dict[str, Any] = {
            "video_summary": {"type": "string"},
            "target_audience": {"type": "string"},
            "video_goal": {"type": "string"},
            "key_message": {"type": "string"},
            "hook_strategy": {"type": "string"},
            "content_outline": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "section": {"type": "string"},
                        "description": {"type": "string"},
                        "duration_estimate": {"type": "number"}
                    }
                }
            },
            "call_to_action": {"type": "string"},
            "visual_style": {"type": "string"},
            "tone": {"type": "string"},
            "seo_keywords": {
                "type": "array",
                "items": {"type": "string"}
            },
        }
        required = [
            "video_summary", "target_audience", "video_goal", "key_message",
            "hook_strategy", "content_outline", "call_to_action",
            "visual_style", "tone", "seo_keywords",
        ]
        if include_scenes:
            properties["scenes"] = {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "scene_number": {"type": "number"},
                        "title": {"type": "string"},
                        "narration": {"type": "string"},
                        "visual_description": {"type": "string"},
                        "duration_estimate": {"type": "number"},
                        "emphasis": {"type": "string"},
                        "visual_cues": {
                            "type": "array",
                            "items": {"type": "string"}
                        }
                    },
                    "required": [
                        "scene_number", "title", "narration", "visual_description",
                        "duration_estimate", "emphasis"
                    ]
                }
            }
            required.append("scenes")
        return {"type": "object", "properties": properties, "required": required}

    def generate_video_plan(
        self,
        user_idea: str,
        duration_type: str,  # "shorts", "medium", "long"
        persona_data: Optional[Dict[str, Any]] = None,
        reference_image_description: Optional[str] = None,
        source_content_id: Optional[str] = None,  # For blog/story conversion
        source_content_type: Optional[str] = None,  # "blog", "story"
        user_id: Optional[str] = None,
        include_scenes: bool = False,  # For shorts: combine plan + scenes in one call
    ) -> Dict[str, Any]:
        """
        Generate a comprehensive video plan from user input.

        Args:
            user_idea: User's video idea or topic
            duration_type: "shorts" (≤60s), "medium" (1-4min), "long" (4-10min);
                any other value falls back to the "medium" constraints
            persona_data: Optional persona data for tone/style
            reference_image_description: Optional description of reference image
            source_content_id: Optional ID of source content (blog/story)
            source_content_type: Type of source content
            user_id: Clerk user ID for subscription checking
            include_scenes: When True and ``duration_type`` is "shorts", the
                scene breakdown is requested in the same LLM call as the plan

        Returns:
            Dictionary with the plan fields returned by the LLM plus
            ``duration_type``, ``duration_metadata`` and ``user_idea``. When a
            combined plan+scenes call was requested, ``_scenes_included`` tells
            the scene builder whether usable scenes came back.

        Raises:
            HTTPException: Re-raised unchanged if raised while generating the
                plan; any other error is wrapped in a 500.
        """
        try:
            logger.info(
                f"[YouTubePlanner] Generating plan: idea={user_idea[:50]}..., "
                f"duration={duration_type}, user={user_id}"
            )
            # Build persona context
            persona_context = self._build_persona_context(persona_data)
            # Build duration context
            duration_context = self._get_duration_context(duration_type)
            # Build source content context if provided
            source_context = ""
            if source_content_id and source_content_type:
                source_context = f"""
**Source Content:**
- Type: {source_content_type}
- ID: {source_content_id}
- Note: This video should be based on the existing {source_content_type} content.
"""
            # Build reference image context
            image_context = ""
            if reference_image_description:
                image_context = f"""
**Reference Image:**
{reference_image_description}
- Use this as visual inspiration for the video
"""
            # Generate comprehensive video plan
            planning_prompt = f"""You are an expert YouTube content strategist. Create a comprehensive video plan based on the user's idea.
**User's Video Idea:**
{user_idea}
**Video Duration Type:**
{duration_type} ({duration_context['description']})
**Duration Guidelines:**
- Target length: {duration_context['target_seconds']} seconds
- Hook duration: {duration_context['hook_seconds']} seconds
- Main content: {duration_context['main_seconds']} seconds
- CTA duration: {duration_context['cta_seconds']} seconds
- Maximum scenes: {duration_context['max_scenes']} (for shorts, keep 2-4 scenes total)
{persona_context}
{source_context}
{image_context}
**Your Task:**
Create a detailed video plan that includes:
1. **Video Summary**: A 2-3 sentence overview of what the video will cover
2. **Target Audience**: Who this video is for
3. **Video Goal**: Primary objective (educate, entertain, sell, inspire, etc.)
4. **Key Message**: The main takeaway viewers should remember
5. **Hook Strategy**: Attention-grabbing opening (first {duration_context['hook_seconds']} seconds)
6. **Content Outline**: High-level structure with 3-5 main sections
7. **Call-to-Action**: Clear CTA that fits the video goal
8. **Visual Style**: Recommended visual approach (cinematic, tutorial, vlog, etc.)
9. **Tone**: Recommended tone (professional, casual, energetic, etc.)
10. **SEO Keywords**: 5-7 relevant keywords for YouTube SEO
**Format your response as JSON:**
{{
"video_summary": "...",
"target_audience": "...",
"video_goal": "...",
"key_message": "...",
"hook_strategy": "...",
"content_outline": [
{{"section": "Section 1", "description": "...", "duration_estimate": 30}},
{{"section": "Section 2", "description": "...", "duration_estimate": 45}}
],
"call_to_action": "...",
"visual_style": "...",
"tone": "...",
"seo_keywords": ["keyword1", "keyword2", ...]
}}
Make sure the content outline fits within the {duration_type} duration constraints.
"""
            system_prompt = (
                "You are an expert YouTube content strategist specializing in creating "
                "engaging, well-structured video plans. Your plans are data-driven, "
                "audience-focused, and optimized for YouTube's algorithm."
            )
            # For shorts, combine plan + scenes in one call to save API calls
            wants_scenes = include_scenes and duration_type == "shorts"
            if wants_scenes:
                planning_prompt += f"""
**IMPORTANT: Since this is a SHORTS video, also generate the complete scene breakdown in the same response.**
**Additional Task - Generate Detailed Scenes:**
Create detailed scenes (up to {duration_context['max_scenes']} scenes) that include:
1. Scene number and title
2. Narration text (what will be spoken) - keep it concise for shorts
3. Visual description (what viewers will see)
4. Duration estimate (2-8 seconds each)
5. Emphasis tags (hook, main_content, transition, cta)
**Scene Format:**
Each scene should be detailed enough for video generation. Total duration must fit within {duration_context['target_seconds']} seconds.
**Update JSON structure to include "scenes" array:**
Add a "scenes" field with the complete scene breakdown.
"""
            json_struct = self._build_plan_schema(wants_scenes)
            # Generate plan using LLM
            response = llm_text_gen(
                prompt=planning_prompt,
                system_prompt=system_prompt,
                user_id=user_id,
                json_struct=json_struct
            )
            # Parse response (handle both dict and JSON string)
            if isinstance(response, dict):
                plan_data = response
            else:
                import json
                plan_data = json.loads(response)
            # Metadata is written by key below, so anything but an object is unusable.
            if not isinstance(plan_data, dict):
                raise ValueError(
                    f"Expected a JSON object for the video plan, got {type(plan_data).__name__}"
                )
            # Add metadata
            plan_data["duration_type"] = duration_type
            plan_data["duration_metadata"] = duration_context
            plan_data["user_idea"] = user_idea
            # If scenes were included, mark them for scene builder.
            # Only a non-empty list counts: a missing, null or empty "scenes"
            # value must make the scene builder regenerate scenes.
            returned_scenes = plan_data.get("scenes")
            if wants_scenes and isinstance(returned_scenes, list) and returned_scenes:
                plan_data["_scenes_included"] = True
                logger.info(
                    f"[YouTubePlanner] ✅ Plan + {len(returned_scenes)} scenes "
                    f"generated in 1 AI call (optimized for shorts)"
                )
            else:
                if wants_scenes:
                    # LLM did not return scenes; downstream will regenerate
                    plan_data["_scenes_included"] = False
                    logger.warning(
                        "[YouTubePlanner] Shorts optimization requested but no scenes returned; "
                        "scene builder will generate scenes separately."
                    )
                logger.info("[YouTubePlanner] ✅ Plan generated successfully")
            return plan_data
        except HTTPException:
            raise
        except Exception as e:
            logger.error(f"[YouTubePlanner] Error generating plan: {e}", exc_info=True)
            raise HTTPException(
                status_code=500,
                detail=f"Failed to generate video plan: {str(e)}"
            )

    def _build_persona_context(self, persona_data: Optional[Dict[str, Any]]) -> str:
        """Build persona context string for prompts.

        Args:
            persona_data: Optional persona dictionary. Reads ``core_persona``
                and, inside it, ``tone``, ``voice_characteristics`` and
                ``core_belief``.

        Returns:
            A prompt fragment describing the persona, or a default
            "professional tone" fragment when no persona data is given.
        """
        if not persona_data:
            return """
**Persona Context:**
- Using default professional tone
- No specific persona constraints
"""
        # `or {}` rather than a .get default: the key may be present with a
        # None value, which a .get default would not replace.
        core_persona = persona_data.get("core_persona") or {}
        tone = core_persona.get("tone", "professional")
        voice = core_persona.get("voice_characteristics") or {}
        return f"""
**Persona Context:**
- Tone: {tone}
- Voice Style: {voice.get('style', 'professional')}
- Communication Style: {voice.get('communication_style', 'clear and direct')}
- Brand Values: {core_persona.get('core_belief', 'value-driven content')}
- Use this persona to guide the video's tone, style, and messaging approach.
"""

    def _get_duration_context(self, duration_type: str) -> Dict[str, Any]:
        """Get duration-specific context and constraints.

        Args:
            duration_type: "shorts", "medium" or "long".

        Returns:
            A freshly built dict of timing constraints for that type. Unknown
            values fall back to the "medium" constraints.
        """
        contexts = {
            "shorts": {
                "description": "YouTube Shorts (15-60 seconds)",
                "target_seconds": 30,
                "hook_seconds": 3,
                "main_seconds": 24,
                "cta_seconds": 3,
                # Keep scenes tight for shorts to control cost and pacing
                "max_scenes": 4,
                "scene_duration_range": (2, 8)
            },
            "medium": {
                "description": "Medium-length video (1-4 minutes)",
                "target_seconds": 150,  # 2.5 minutes
                "hook_seconds": 10,
                "main_seconds": 130,
                "cta_seconds": 10,
                "max_scenes": 12,
                "scene_duration_range": (5, 15)
            },
            "long": {
                "description": "Long-form video (4-10 minutes)",
                "target_seconds": 420,  # 7 minutes
                "hook_seconds": 15,
                "main_seconds": 380,
                "cta_seconds": 25,
                "max_scenes": 20,
                "scene_duration_range": (10, 30)
            }
        }
        return contexts.get(duration_type, contexts["medium"])

View File

@@ -0,0 +1,412 @@
"""
YouTube Video Renderer Service
Handles video rendering using WAN 2.5 text-to-video and audio generation.
"""
from typing import Dict, Any, List, Optional
from pathlib import Path
import base64
import uuid
import requests
from loguru import logger
from fastapi import HTTPException
from services.wavespeed.client import WaveSpeedClient
from services.llm_providers.main_audio_generation import generate_audio
from services.story_writer.video_generation_service import StoryVideoGenerationService
from services.subscription import PricingService
from services.subscription.preflight_validator import validate_scene_animation_operation
from services.llm_providers.main_video_generation import track_video_usage
from utils.logger_utils import get_service_logger
from utils.asset_tracker import save_asset_to_library
# Rebinds the module-level name `logger`: the loguru `logger` imported above is
# replaced by whatever get_service_logger returns for "youtube.renderer".
logger = get_service_logger("youtube.renderer")
class YouTubeVideoRendererService:
    """Service for rendering YouTube videos from scenes.

    Each scene is rendered with the WaveSpeed text-to-video client, saved via
    ``StoryVideoGenerationService`` and recorded with ``track_video_usage``.
    """

    def __init__(self):
        """Initialize the renderer service and create the output directory."""
        self.wavespeed_client = WaveSpeedClient()
        # Video output directory
        base_dir = Path(__file__).parent.parent.parent.parent
        self.output_dir = base_dir / "youtube_videos"
        self.output_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"[YouTubeRenderer] Initialized with output directory: {self.output_dir}")

    @staticmethod
    def _clamp_duration(duration_estimate: Any) -> int:
        """Map a scene's duration estimate onto the 5s/10s clip lengths.

        Estimates up to 7 seconds become 5, anything longer becomes 10.
        Missing or non-numeric estimates fall back to 5.
        """
        try:
            seconds = float(duration_estimate)
        except (TypeError, ValueError):
            return 5
        return 5 if seconds <= 7 else 10

    def render_scene_video(
        self,
        scene: Dict[str, Any],
        video_plan: Dict[str, Any],
        user_id: str,
        resolution: str = "720p",
        generate_audio_enabled: bool = True,
        voice_id: str = "Wise_Woman",
    ) -> Dict[str, Any]:
        """
        Render a single scene into a video.

        Args:
            scene: Scene data with narration and visual prompts
            video_plan: Original video plan for context (not read by this method)
            user_id: Clerk user ID
            resolution: Video resolution (480p, 720p, 1080p)
            generate_audio_enabled: Whether to generate narration audio
            voice_id: Voice ID for audio generation

        Returns:
            Dictionary with the saved video's filename, URL, path, duration,
            cost, dimensions, file size, prediction ID and usage info.

        Raises:
            HTTPException: 400 for a missing or too-short visual prompt, 504/502
                for WaveSpeed timeouts/request failures, 500 for anything else.
        """
        # Bound before the try block so both exception handlers can always
        # reference it, even if reading the scene itself fails.
        scene_number = 1
        try:
            scene_number = scene.get("scene_number", 1)
            # `or ""` guards against keys that are present with a None value.
            narration = (scene.get("narration") or "").strip()
            visual_prompt = (
                scene.get("enhanced_visual_prompt") or scene.get("visual_prompt") or ""
            ).strip()
            # VALIDATION: Check inputs before making any billable API call
            # (audio generation included).
            if not visual_prompt:
                raise HTTPException(
                    status_code=400,
                    detail={
                        "error": f"Scene {scene_number} has no visual prompt",
                        "scene_number": scene_number,
                        "message": "Visual prompt is required for video generation",
                        "user_action": "Please add a visual description for this scene before rendering.",
                    }
                )
            if len(visual_prompt) < 5:
                raise HTTPException(
                    status_code=400,
                    detail={
                        "error": f"Scene {scene_number} has invalid visual prompt",
                        "scene_number": scene_number,
                        "message": "Visual prompt must be at least 5 characters",
                        "user_action": "Please provide a valid visual description for this scene.",
                    }
                )
            if len(visual_prompt) < 10:
                logger.warning(
                    f"[YouTubeRenderer] Scene {scene_number} has very short visual prompt "
                    f"({len(visual_prompt)} chars), may result in poor quality"
                )
            # Clamp duration to valid WAN 2.5 values (5 or 10 seconds)
            duration = self._clamp_duration(scene.get("duration_estimate", 5))
            logger.info(
                f"[YouTubeRenderer] Rendering scene {scene_number}: "
                f"resolution={resolution}, duration={duration}s, prompt_length={len(visual_prompt)}"
            )
            # Generate audio if requested - only if narration is not empty
            audio_base64 = None
            if generate_audio_enabled and narration:
                try:
                    audio_result = generate_audio(
                        text=narration,
                        voice_id=voice_id,
                        user_id=user_id,
                    )
                    # generate_audio may return raw bytes or AudioGenerationResult
                    audio_bytes = audio_result.audio_bytes if hasattr(audio_result, "audio_bytes") else audio_result
                    # Convert to base64 (just the base64 string, not data URI)
                    audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
                    logger.info(f"[YouTubeRenderer] Generated audio for scene {scene_number}")
                except Exception as e:
                    # Deliberate best-effort: a scene without narration audio is still rendered.
                    logger.warning(f"[YouTubeRenderer] Audio generation failed: {e}, continuing without audio")
            # Generate video using WAN 2.5 text-to-video
            # This is the expensive API call - all validation is done before this
            # Use sync mode to wait for result directly (prevents timeout issues)
            try:
                video_result = self.wavespeed_client.generate_text_video(
                    prompt=visual_prompt,
                    resolution=resolution,
                    duration=duration,
                    audio_base64=audio_base64,  # Optional: enables lip-sync if provided
                    enable_prompt_expansion=True,
                    enable_sync_mode=True,  # Use sync mode to wait for result directly
                    timeout=600,  # Increased timeout for sync mode (10 minutes)
                )
            except requests.exceptions.Timeout as e:
                logger.error(f"[YouTubeRenderer] WaveSpeed API timed out for scene {scene_number}: {e}")
                raise HTTPException(
                    status_code=504,
                    detail={
                        "error": "WaveSpeed request timed out",
                        "scene_number": scene_number,
                        "message": "The video generation request timed out.",
                        "user_action": "Please retry. If it persists, try fewer scenes, lower resolution, or shorter durations.",
                    },
                ) from e
            except requests.exceptions.RequestException as e:
                logger.error(f"[YouTubeRenderer] WaveSpeed API request failed for scene {scene_number}: {e}")
                raise HTTPException(
                    status_code=502,
                    detail={
                        "error": "WaveSpeed request failed",
                        "scene_number": scene_number,
                        "message": str(e),
                        "user_action": "Please retry. If it persists, check network connectivity or try again later.",
                    },
                ) from e
            # Save scene video
            video_service = StoryVideoGenerationService(output_dir=str(self.output_dir))
            save_result = video_service.save_scene_video(
                video_bytes=video_result["video_bytes"],
                scene_number=scene_number,
                user_id=user_id,
            )
            # Update video URL to use YouTube API endpoint
            filename = save_result["video_filename"]
            save_result["video_url"] = f"/api/youtube/videos/{filename}"
            # Track usage
            usage_info = track_video_usage(
                user_id=user_id,
                provider=video_result["provider"],
                model_name=video_result["model_name"],
                prompt=visual_prompt,
                video_bytes=video_result["video_bytes"],
                cost_override=video_result["cost"],
            )
            logger.info(
                f"[YouTubeRenderer] ✅ Scene {scene_number} rendered: "
                f"cost=${video_result['cost']:.2f}, size={len(video_result['video_bytes'])} bytes"
            )
            return {
                "scene_number": scene_number,
                "video_filename": save_result["video_filename"],
                "video_url": save_result["video_url"],
                "video_path": save_result["video_path"],
                "duration": video_result["duration"],
                "cost": video_result["cost"],
                "resolution": resolution,
                "width": video_result["width"],
                "height": video_result["height"],
                "file_size": save_result["file_size"],
                "prediction_id": video_result.get("prediction_id"),
                "usage_info": usage_info,
            }
        except HTTPException as e:
            # Re-raise with better error message for UI
            error_detail = e.detail
            user_action = (
                "Please try again. If the issue persists, check your scene content and try a different resolution."
            )
            if isinstance(error_detail, dict):
                error_msg = error_detail.get("error", str(error_detail))
                # Keep the specific guidance attached where the error was raised
                # instead of replacing it with the generic text.
                user_action = error_detail.get("user_action") or user_action
            else:
                error_msg = str(error_detail)
            logger.error(
                f"[YouTubeRenderer] Scene {scene_number} failed: {error_msg}",
                exc_info=True
            )
            raise HTTPException(
                status_code=e.status_code,
                detail={
                    "error": f"Failed to render scene {scene_number}",
                    "scene_number": scene_number,
                    "message": error_msg,
                    "user_action": user_action,
                }
            ) from e
        except Exception as e:
            logger.error(f"[YouTubeRenderer] Error rendering scene {scene_number}: {e}", exc_info=True)
            raise HTTPException(
                status_code=500,
                detail={
                    "error": f"Failed to render scene {scene_number}",
                    "scene_number": scene_number,
                    "message": str(e),
                    "user_action": "Please try again. If the issue persists, check your scene content and try a different resolution.",
                }
            ) from e

    def render_full_video(
        self,
        scenes: List[Dict[str, Any]],
        video_plan: Dict[str, Any],
        user_id: str,
        resolution: str = "720p",
        combine_scenes: bool = True,
        voice_id: str = "Wise_Woman",
    ) -> Dict[str, Any]:
        """
        Render a complete video from multiple scenes.

        Args:
            scenes: List of scene data; scenes with ``enabled`` set to a falsy
                value are skipped
            video_plan: Original video plan
            user_id: Clerk user ID
            resolution: Video resolution
            combine_scenes: Whether to combine scenes into single video
                (only applied when more than one scene was rendered)
            voice_id: Voice ID for narration

        Returns:
            Dictionary with per-scene results, total cost and, when scenes were
            combined, the final video path and URL (otherwise ``None``).

        Raises:
            HTTPException: 400 when no scene is enabled; scene rendering errors
                propagate unchanged; any other error is wrapped in a 500.
        """
        try:
            logger.info(
                f"[YouTubeRenderer] Rendering full video: {len(scenes)} scenes, "
                f"resolution={resolution}, user={user_id}"
            )
            # Filter enabled scenes
            enabled_scenes = [s for s in scenes if s.get("enabled", True)]
            if not enabled_scenes:
                raise HTTPException(status_code=400, detail="No enabled scenes to render")
            scene_results = []
            total_cost = 0.0
            # Render each scene
            for idx, scene in enumerate(enabled_scenes):
                logger.info(
                    f"[YouTubeRenderer] Rendering scene {idx + 1}/{len(enabled_scenes)}: "
                    f"Scene {scene.get('scene_number', idx + 1)}"
                )
                scene_result = self.render_scene_video(
                    scene=scene,
                    video_plan=video_plan,
                    user_id=user_id,
                    resolution=resolution,
                    generate_audio_enabled=True,
                    voice_id=voice_id,
                )
                scene_results.append(scene_result)
                total_cost += scene_result["cost"]
            # Combine scenes if requested
            final_video_path = None
            final_video_url = None
            if combine_scenes and len(scene_results) > 1:
                logger.info("[YouTubeRenderer] Combining scenes into final video...")
                # Prepare data for video concatenation
                scene_video_paths = [r["video_path"] for r in scene_results]
                # NOTE: render_scene_video does not return an "audio_path" key,
                # so this list is empty for results produced by this class.
                scene_audio_paths = [r.get("audio_path") for r in scene_results if r.get("audio_path")]
                # Use StoryVideoGenerationService to combine
                video_service = StoryVideoGenerationService(output_dir=str(self.output_dir))
                # Create scene dicts for concatenation
                scene_dicts = [
                    {
                        "scene_number": r["scene_number"],
                        "title": f"Scene {r['scene_number']}",
                    }
                    for r in scene_results
                ]
                # `or` covers a video_summary that is present but None/empty.
                story_title = (video_plan.get("video_summary") or "YouTube Video")[:50]
                combined_result = video_service.generate_story_video(
                    scenes=scene_dicts,
                    image_paths=[None] * len(scene_results),  # No static images
                    audio_paths=scene_audio_paths if scene_audio_paths else [],
                    video_paths=scene_video_paths,  # Use rendered videos
                    user_id=user_id,
                    story_title=story_title,
                    fps=24,
                )
                final_video_path = combined_result["video_path"]
                final_video_url = combined_result["video_url"]
            logger.info(
                f"[YouTubeRenderer] ✅ Full video rendered: {len(scene_results)} scenes, "
                f"total_cost=${total_cost:.2f}"
            )
            return {
                "success": True,
                "scene_results": scene_results,
                "total_cost": total_cost,
                "final_video_path": final_video_path,
                "final_video_url": final_video_url,
                "num_scenes": len(scene_results),
                "resolution": resolution,
            }
        except HTTPException:
            raise
        except Exception as e:
            logger.error(f"[YouTubeRenderer] Error rendering full video: {e}", exc_info=True)
            raise HTTPException(
                status_code=500,
                detail=f"Failed to render video: {str(e)}"
            )

    def estimate_render_cost(
        self,
        scenes: List[Dict[str, Any]],
        resolution: str = "720p",
    ) -> Dict[str, Any]:
        """
        Estimate the cost of rendering a video before actually rendering it.

        Args:
            scenes: List of scene data with duration estimates
            resolution: Video resolution (480p, 720p, 1080p); unknown values
                are priced at the 720p rate

        Returns:
            Dictionary with per-scene costs, total duration, total cost and a
            +/-10% cost range.
        """
        # Pricing per second (same as in WaveSpeedClient)
        pricing = {
            "480p": 0.05,
            "720p": 0.10,
            "1080p": 0.15,
        }
        price_per_second = pricing.get(resolution, 0.10)
        # Filter enabled scenes
        enabled_scenes = [s for s in scenes if s.get("enabled", True)]
        scene_costs = []
        total_cost = 0.0
        total_duration = 0.0
        for scene in enabled_scenes:
            scene_number = scene.get("scene_number", 0)
            duration_estimate = scene.get("duration_estimate", 5)
            # Clamp duration to valid WAN 2.5 values (5 or 10 seconds)
            duration = self._clamp_duration(duration_estimate)
            scene_cost = price_per_second * duration
            scene_costs.append({
                "scene_number": scene_number,
                "duration_estimate": duration_estimate,
                "actual_duration": duration,
                "cost": round(scene_cost, 2),
            })
            total_cost += scene_cost
            total_duration += duration
        return {
            "resolution": resolution,
            "price_per_second": price_per_second,
            "num_scenes": len(enabled_scenes),
            "total_duration_seconds": total_duration,
            "scene_costs": scene_costs,
            "total_cost": round(total_cost, 2),
            "estimated_cost_range": {
                "min": round(total_cost * 0.9, 2),  # 10% buffer
                "max": round(total_cost * 1.1, 2),  # 10% buffer
            },
        }

View File

@@ -0,0 +1,551 @@
"""
YouTube Scene Builder Service
Converts video plans into structured scenes with narration, visual prompts, and timing.
"""
from typing import Dict, Any, Optional, List
from loguru import logger
from fastapi import HTTPException
from services.llm_providers.main_text_generation import llm_text_gen
from services.story_writer.prompt_enhancer_service import PromptEnhancerService
from utils.logger_utils import get_service_logger
# Rebinds the module-level name `logger`: the loguru `logger` imported above is
# replaced by whatever get_service_logger returns for "youtube.scene_builder".
logger = get_service_logger("youtube.scene_builder")
class YouTubeSceneBuilderService:
"""Service for building structured video scenes from plans."""
def __init__(self):
    """Set up the scene builder with its own prompt enhancer instance."""
    enhancer = PromptEnhancerService()
    self.prompt_enhancer = enhancer
    logger.info("[YouTubeSceneBuilder] Service initialized")
def build_scenes_from_plan(
    self,
    video_plan: Dict[str, Any],
    user_id: str,
    custom_script: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Turn a video plan into a list of render-ready scenes.

    Scene source, in priority order:
    1. ``custom_script`` when given (parsed into scenes).
    2. Scenes already embedded in a shorts plan flagged ``_scenes_included``.
    3. A fresh AI generation from the plan.

    The result is capped at the plan's ``max_scenes`` (default 10) and then
    passed through visual prompt enhancement.

    Args:
        video_plan: Video plan from planner service
        user_id: Clerk user ID for subscription checking
        custom_script: Optional custom script to use instead of generating

    Returns:
        List of scene dictionaries with narration, visual prompts, timing, etc.

    Raises:
        HTTPException: Propagated unchanged; any other error becomes a 500.
    """
    try:
        outline_sections = video_plan.get('content_outline', [])
        logger.info(
            f"[YouTubeSceneBuilder] Building scenes from plan: "
            f"duration={video_plan.get('duration_type')}, "
            f"sections={len(outline_sections)}"
        )
        duration_metadata = video_plan.get("duration_metadata", {})
        scene_cap = duration_metadata.get("max_scenes", 10)

        if custom_script:
            # A user-supplied script always wins over generated content.
            scenes = self._parse_custom_script(
                custom_script, video_plan, duration_metadata, user_id
            )
        else:
            # Shorts plans may already carry scenes from the combined plan+scenes call.
            flagged_prebuilt = (
                video_plan.get("_scenes_included")
                and video_plan.get("duration_type") == "shorts"
            )
            prebuilt = (video_plan.get("scenes") or []) if flagged_prebuilt else []
            if prebuilt:
                logger.info(
                    f"[YouTubeSceneBuilder] Using scenes from optimized plan+scenes call "
                    f"({len(prebuilt)} scenes)"
                )
                scenes = self._normalize_scenes_from_plan(video_plan, duration_metadata)
            else:
                if flagged_prebuilt:
                    logger.warning(
                        "[YouTubeSceneBuilder] Plan marked _scenes_included but no scenes present; "
                        "regenerating scenes normally."
                    )
                scenes = self._generate_scenes_from_plan(
                    video_plan, duration_metadata, user_id
                )

        # Enforce the per-duration scene cap.
        scene_count = len(scenes)
        if scene_count > scene_cap:
            logger.warning(
                f"[YouTubeSceneBuilder] Truncating {scene_count} scenes to {scene_cap}"
            )
            scenes = scenes[:scene_cap]

        # Enhancement strategy depends on the duration type.
        scenes = self._enhance_visual_prompts_batch(
            scenes, video_plan, user_id, video_plan.get("duration_type", "medium")
        )
        logger.info(f"[YouTubeSceneBuilder] ✅ Built {len(scenes)} scenes")
        return scenes
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"[YouTubeSceneBuilder] Error building scenes: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to build scenes: {str(e)}"
        )
def _generate_scenes_from_plan(
    self,
    video_plan: Dict[str, Any],
    duration_metadata: Dict[str, Any],
    user_id: str,
) -> List[Dict[str, Any]]:
    """Generate scenes from video plan using AI.

    Args:
        video_plan: Plan dictionary; reads the summary, goal, key message,
            hook strategy, content outline, call-to-action, visual style and tone.
        duration_metadata: Timing constraints (``scene_duration_range``,
            ``target_seconds``, ``hook_seconds``, ``cta_seconds``).
        user_id: Passed through to ``llm_text_gen``.

    Returns:
        Normalized scene dictionaries; ``visual_prompt`` starts out equal to
        ``visual_description``.

    Raises:
        ValueError: If the LLM response does not contain a list of scenes.
    """
    # `or []` covers an outline that is present but None.
    content_outline = video_plan.get("content_outline") or []
    hook_strategy = video_plan.get("hook_strategy", "")
    call_to_action = video_plan.get("call_to_action", "")
    visual_style = video_plan.get("visual_style", "cinematic")
    tone = video_plan.get("tone", "professional")
    scene_duration_range = duration_metadata.get("scene_duration_range", (5, 15))
    # One bullet per outline section; entries that are not dicts are ignored.
    outline_text = chr(10).join([
        f"- {section.get('section', '')}: {section.get('description', '')} ({section.get('duration_estimate', 0)}s)"
        for section in content_outline
        if isinstance(section, dict)
    ])
    scene_generation_prompt = f"""You are an expert video scriptwriter. Create detailed scenes for a YouTube video based on this plan.
**Video Plan:**
- Summary: {video_plan.get('video_summary', '')}
- Goal: {video_plan.get('video_goal', '')}
- Key Message: {video_plan.get('key_message', '')}
- Visual Style: {visual_style}
- Tone: {tone}
**Hook Strategy:**
{hook_strategy}
**Content Outline:**
{outline_text}
**Call-to-Action:**
{call_to_action}
**Duration Constraints:**
- Scene duration: {scene_duration_range[0]}-{scene_duration_range[1]} seconds each
- Total target: {duration_metadata.get('target_seconds', 150)} seconds
**Your Task:**
Create detailed scenes that include:
1. Scene number and title
2. Narration text (what will be spoken)
3. Visual description (what viewers will see)
4. Duration estimate
5. Emphasis tags (hook, main_content, transition, cta)
**Format as JSON array:**
[
{{
"scene_number": 1,
"title": "Hook - Attention Grabber",
"narration": "The spoken text for this scene...",
"visual_description": "Detailed description of what viewers see...",
"duration_estimate": 5,
"emphasis": "hook",
"visual_cues": ["close-up", "dynamic", "bright"]
}},
...
]
Make sure:
- First scene is a strong hook ({duration_metadata.get('hook_seconds', 10)}s)
- Last scene includes the CTA ({duration_metadata.get('cta_seconds', 10)}s)
- Each scene has clear narration and visual description
- Total duration fits within {duration_metadata.get('target_seconds', 150)} seconds
- Scenes flow naturally from one to the next
"""
    system_prompt = (
        "You are an expert video scriptwriter specializing in YouTube content. "
        "Your scenes are engaging, well-paced, and optimized for viewer retention."
    )
    response = llm_text_gen(
        prompt=scene_generation_prompt,
        system_prompt=system_prompt,
        user_id=user_id,
        json_struct={
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "scene_number": {"type": "number"},
                    "title": {"type": "string"},
                    "narration": {"type": "string"},
                    "visual_description": {"type": "string"},
                    "duration_estimate": {"type": "number"},
                    "emphasis": {"type": "string"},
                    "visual_cues": {
                        "type": "array",
                        "items": {"type": "string"}
                    }
                },
                "required": [
                    "scene_number", "title", "narration", "visual_description",
                    "duration_estimate", "emphasis"
                ]
            }
        }
    )
    # Parse response: decode a JSON string first so the shape checks below
    # apply to decoded payloads too.
    if isinstance(response, str):
        import json
        response = json.loads(response)
    # Accept either a bare array or an object wrapping it under "scenes".
    scenes = response.get("scenes") if isinstance(response, dict) else response
    if not isinstance(scenes, list):
        raise ValueError(
            f"LLM response did not contain a list of scenes (got {type(response).__name__})"
        )
    # Normalize scene data, ignoring entries that are not objects.
    scene_entries = [scene for scene in scenes if isinstance(scene, dict)]
    normalized_scenes = []
    for idx, scene in enumerate(scene_entries, 1):
        normalized_scenes.append({
            "scene_number": scene.get("scene_number", idx),
            "title": scene.get("title", f"Scene {idx}"),
            "narration": scene.get("narration", ""),
            "visual_description": scene.get("visual_description", ""),
            "duration_estimate": scene.get("duration_estimate", scene_duration_range[0]),
            "emphasis": scene.get("emphasis", "main_content"),
            "visual_cues": scene.get("visual_cues", []),
            "visual_prompt": scene.get("visual_description", ""),  # Initial prompt
        })
    return normalized_scenes
def _normalize_scenes_from_plan(
    self,
    video_plan: Dict[str, Any],
    duration_metadata: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """Bring scenes embedded in a plan into the scene builder's standard shape.

    Used for shorts, where the planner may have returned scenes together with
    the plan. Missing fields are filled with defaults, and ``visual_prompt``
    starts out equal to ``visual_description``.
    """
    fallback_duration = duration_metadata.get("scene_duration_range", (2, 8))[0]
    normalized_scenes = [
        {
            "scene_number": raw.get("scene_number", position),
            "title": raw.get("title", f"Scene {position}"),
            "narration": raw.get("narration", ""),
            "visual_description": raw.get("visual_description", ""),
            "duration_estimate": raw.get("duration_estimate", fallback_duration),
            "emphasis": raw.get("emphasis", "main_content"),
            "visual_cues": raw.get("visual_cues", []),
            # Seed prompt; enhancement may replace it later.
            "visual_prompt": raw.get("visual_description", ""),
        }
        for position, raw in enumerate(video_plan.get("scenes", []), 1)
    ]
    logger.info(
        f"[YouTubeSceneBuilder] ✅ Normalized {len(normalized_scenes)} scenes "
        f"from optimized plan (saved 1 AI call)"
    )
    return normalized_scenes
def _parse_custom_script(
self,
custom_script: str,
video_plan: Dict[str, Any],
duration_metadata: Dict[str, Any],
user_id: str,
) -> List[Dict[str, Any]]:
"""Parse a custom script into structured scenes."""
# Simple parsing: split by double newlines or scene markers
import re
# Try to detect scene markers
scene_pattern = r'(?:Scene\s+\d+|#\s*\d+\.|^\d+\.)\s*(.+?)(?=(?:Scene\s+\d+|#\s*\d+\.|^\d+\.|$))'
matches = re.finditer(scene_pattern, custom_script, re.MULTILINE | re.DOTALL)
scenes = []
for idx, match in enumerate(matches, 1):
scene_text = match.group(1).strip()
# Extract narration (first paragraph or before visual markers)
narration_match = re.search(r'^(.*?)(?:\n\n|Visual:|Image:)', scene_text, re.DOTALL)
narration = narration_match.group(1).strip() if narration_match else scene_text.split('\n')[0]
# Extract visual description
visual_match = re.search(r'(?:Visual:|Image:)\s*(.+?)(?:\n\n|$)', scene_text, re.DOTALL)
visual_description = visual_match.group(1).strip() if visual_match else narration
scenes.append({
"scene_number": idx,
"title": f"Scene {idx}",
"narration": narration,
"visual_description": visual_description,
"duration_estimate": duration_metadata.get("scene_duration_range", [5, 15])[0],
"emphasis": "hook" if idx == 1 else ("cta" if idx == len(list(matches)) else "main_content"),
"visual_cues": [],
"visual_prompt": visual_description,
})
# Fallback: split by paragraphs if no scene markers
if not scenes:
paragraphs = [p.strip() for p in custom_script.split('\n\n') if p.strip()]
for idx, para in enumerate(paragraphs[:duration_metadata.get("max_scenes", 10)], 1):
scenes.append({
"scene_number": idx,
"title": f"Scene {idx}",
"narration": para,
"visual_description": para,
"duration_estimate": duration_metadata.get("scene_duration_range", [5, 15])[0],
"emphasis": "hook" if idx == 1 else ("cta" if idx == len(paragraphs) else "main_content"),
"visual_cues": [],
"visual_prompt": para,
})
return scenes
def _enhance_visual_prompts_batch(
self,
scenes: List[Dict[str, Any]],
video_plan: Dict[str, Any],
user_id: str,
duration_type: str,
) -> List[Dict[str, Any]]:
"""
Efficiently enhance visual prompts based on video duration type.
Strategy:
- Shorts: Skip enhancement (use original descriptions) - 0 AI calls
- Medium: Batch enhance all scenes in 1 call - 1 AI call
- Long: Batch enhance in 2 calls (split scenes) - 2 AI calls max
"""
# For shorts, skip enhancement to save API calls
if duration_type == "shorts":
logger.info(
f"[YouTubeSceneBuilder] Skipping prompt enhancement for shorts "
f"({len(scenes)} scenes) to save API calls"
)
for scene in scenes:
scene["enhanced_visual_prompt"] = scene.get(
"visual_prompt", scene.get("visual_description", "")
)
return scenes
# Build story context for prompt enhancer
story_context = {
"story_setting": video_plan.get("visual_style", "cinematic"),
"story_tone": video_plan.get("tone", "professional"),
"writing_style": video_plan.get("visual_style", "cinematic"),
}
# Convert scenes to format expected by enhancer
scene_data_list = [
{
"scene_number": scene.get("scene_number", idx + 1),
"title": scene.get("title", ""),
"description": scene.get("visual_description", ""),
"image_prompt": scene.get("visual_prompt", ""),
}
for idx, scene in enumerate(scenes)
]
# For medium videos, enhance all scenes in one batch call
if duration_type == "medium":
logger.info(
f"[YouTubeSceneBuilder] Batch enhancing {len(scenes)} scenes "
f"for medium video in 1 AI call"
)
try:
# Use a single batch enhancement call
enhanced_prompts = self._batch_enhance_prompts(
scene_data_list, story_context, user_id
)
for idx, scene in enumerate(scenes):
scene["enhanced_visual_prompt"] = enhanced_prompts.get(
idx, scene.get("visual_prompt", scene.get("visual_description", ""))
)
except Exception as e:
logger.warning(
f"[YouTubeSceneBuilder] Batch enhancement failed: {e}, "
f"using original prompts"
)
for scene in scenes:
scene["enhanced_visual_prompt"] = scene.get(
"visual_prompt", scene.get("visual_description", "")
)
return scenes
# For long videos, split into 2 batches to avoid token limits
if duration_type == "long":
logger.info(
f"[YouTubeSceneBuilder] Batch enhancing {len(scenes)} scenes "
f"for long video in 2 AI calls"
)
mid_point = len(scenes) // 2
batches = [
scene_data_list[:mid_point],
scene_data_list[mid_point:],
]
all_enhanced = {}
for batch_idx, batch in enumerate(batches):
try:
enhanced = self._batch_enhance_prompts(
batch, story_context, user_id
)
start_idx = 0 if batch_idx == 0 else mid_point
for local_idx, enhanced_prompt in enhanced.items():
all_enhanced[start_idx + local_idx] = enhanced_prompt
except Exception as e:
logger.warning(
f"[YouTubeSceneBuilder] Batch {batch_idx + 1} enhancement "
f"failed: {e}, using original prompts"
)
start_idx = 0 if batch_idx == 0 else mid_point
for local_idx, scene_data in enumerate(batch):
all_enhanced[start_idx + local_idx] = scene_data.get(
"image_prompt", scene_data.get("description", "")
)
for idx, scene in enumerate(scenes):
scene["enhanced_visual_prompt"] = all_enhanced.get(
idx, scene.get("visual_prompt", scene.get("visual_description", ""))
)
return scenes
# Fallback: use original prompts
logger.warning(
f"[YouTubeSceneBuilder] Unknown duration type '{duration_type}', "
f"using original prompts"
)
for scene in scenes:
scene["enhanced_visual_prompt"] = scene.get(
"visual_prompt", scene.get("visual_description", "")
)
return scenes
def _batch_enhance_prompts(
self,
scene_data_list: List[Dict[str, Any]],
story_context: Dict[str, Any],
user_id: str,
) -> Dict[int, str]:
"""
Enhance multiple scene prompts in a single AI call.
Returns:
Dictionary mapping scene index to enhanced prompt
"""
try:
# Build batch enhancement prompt
scenes_text = "\n\n".join([
f"Scene {scene.get('scene_number', idx + 1)}: {scene.get('title', '')}\n"
f"Description: {scene.get('description', '')}\n"
f"Current Prompt: {scene.get('image_prompt', '')}"
for idx, scene in enumerate(scene_data_list)
])
batch_prompt = f"""You are optimizing visual prompts for AI video generation. Enhance the following scenes to be more detailed and video-optimized.
**Video Style Context:**
- Setting: {story_context.get('story_setting', 'cinematic')}
- Tone: {story_context.get('story_tone', 'professional')}
- Style: {story_context.get('writing_style', 'cinematic')}
**Scenes to Enhance:**
{scenes_text}
**Your Task:**
For each scene, create an enhanced visual prompt (200-300 words) that:
1. Is detailed and specific for video generation
2. Includes camera movements, lighting, composition
3. Maintains consistency with the video style
4. Is optimized for WAN 2.5 text-to-video model
**Format as JSON array with enhanced prompts:**
[
{{"scene_index": 0, "enhanced_prompt": "detailed enhanced prompt for scene 1..."}},
{{"scene_index": 1, "enhanced_prompt": "detailed enhanced prompt for scene 2..."}},
...
]
Make sure the array length matches the number of scenes provided ({len(scene_data_list)}).
"""
system_prompt = (
"You are an expert at creating detailed visual prompts for AI video generation. "
"Your prompts are specific, cinematic, and optimized for video models."
)
response = llm_text_gen(
prompt=batch_prompt,
system_prompt=system_prompt,
user_id=user_id,
json_struct={
"type": "array",
"items": {
"type": "object",
"properties": {
"scene_index": {"type": "number"},
"enhanced_prompt": {"type": "string"}
},
"required": ["scene_index", "enhanced_prompt"]
}
}
)
# Parse response
if isinstance(response, list):
enhanced_list = response
elif isinstance(response, str):
import json
enhanced_list = json.loads(response)
else:
enhanced_list = response
# Build result dictionary
result = {}
for item in enhanced_list:
idx = item.get("scene_index", 0)
prompt = item.get("enhanced_prompt", "")
if prompt:
result[idx] = prompt
else:
# Fallback to original
original_scene = scene_data_list[idx] if idx < len(scene_data_list) else {}
result[idx] = original_scene.get(
"image_prompt", original_scene.get("description", "")
)
# Fill in any missing scenes with original prompts
for idx in range(len(scene_data_list)):
if idx not in result:
original_scene = scene_data_list[idx]
result[idx] = original_scene.get(
"image_prompt", original_scene.get("description", "")
)
logger.info(
f"[YouTubeSceneBuilder] ✅ Batch enhanced {len(result)} prompts "
f"in 1 AI call"
)
return result
except Exception as e:
logger.error(
f"[YouTubeSceneBuilder] Batch enhancement failed: {e}",
exc_info=True
)
# Return original prompts as fallback
return {
idx: scene.get("image_prompt", scene.get("description", ""))
for idx, scene in enumerate(scene_data_list)
}