Files
Kunthawat Greethong c35fa52117 Base code
2026-01-08 22:39:53 +07:00

599 lines
25 KiB
Python

"""
YouTube Scene Builder Service
Converts video plans into structured scenes with narration, visual prompts, and timing.
"""
from typing import Dict, Any, Optional, List
from loguru import logger
from fastapi import HTTPException
from services.llm_providers.main_text_generation import llm_text_gen
from services.story_writer.prompt_enhancer_service import PromptEnhancerService
from utils.logger_utils import get_service_logger
logger = get_service_logger("youtube.scene_builder")
class YouTubeSceneBuilderService:
    """Service for building structured video scenes from plans.

    Converts a planner-produced video plan (and optionally a user-supplied
    script) into a list of normalized scene dicts, then enhances the scenes'
    visual prompts while minimizing the number of LLM calls.
    """

    def __init__(self):
        """Initialize the scene builder service."""
        # NOTE(review): self.prompt_enhancer is not referenced anywhere else in
        # this class — _batch_enhance_prompts calls llm_text_gen directly.
        # Confirm whether this dependency is still needed.
        self.prompt_enhancer = PromptEnhancerService()
        logger.info("[YouTubeSceneBuilder] Service initialized")
def build_scenes_from_plan(
self,
video_plan: Dict[str, Any],
user_id: str,
custom_script: Optional[str] = None,
) -> List[Dict[str, Any]]:
"""
Build structured scenes from a video plan.
This method is optimized to minimize AI calls:
- For shorts: Reuses scenes if already generated in plan (0 AI calls)
- For medium/long: Generates scenes + batch enhances (1-3 AI calls total)
- Custom script: Parses script without AI calls (0 AI calls)
Args:
video_plan: Video plan from planner service
user_id: Clerk user ID for subscription checking
custom_script: Optional custom script to use instead of generating
Returns:
List of scene dictionaries with narration, visual prompts, timing, etc.
"""
try:
duration_type = video_plan.get('duration_type', 'medium')
logger.info(
f"[YouTubeSceneBuilder] Building scenes from plan: "
f"duration={duration_type}, "
f"sections={len(video_plan.get('content_outline', []))}, "
f"user={user_id}"
)
duration_metadata = video_plan.get("duration_metadata", {})
max_scenes = duration_metadata.get("max_scenes", 10)
# Optimization: Check if scenes already exist in plan (prevents duplicate generation)
# This can happen if plan was generated with include_scenes=True for shorts
existing_scenes = video_plan.get("scenes", [])
if existing_scenes and video_plan.get("_scenes_included"):
# Scenes already generated in plan - reuse them (0 AI calls)
logger.info(
f"[YouTubeSceneBuilder] ♻️ Reusing {len(existing_scenes)} scenes from plan "
f"(duration={duration_type}) - skipping generation to save AI calls"
)
scenes = self._normalize_scenes_from_plan(video_plan, duration_metadata)
# If custom script provided, parse it into scenes (0 AI calls for parsing)
elif custom_script:
logger.info(
f"[YouTubeSceneBuilder] Parsing custom script for scene generation "
f"(0 AI calls required)"
)
scenes = self._parse_custom_script(
custom_script, video_plan, duration_metadata, user_id
)
# For shorts, check if scenes were already generated in plan (optimization)
elif video_plan.get("_scenes_included") and duration_type == "shorts":
prebuilt = video_plan.get("scenes") or []
if prebuilt:
logger.info(
f"[YouTubeSceneBuilder] Using scenes from optimized plan+scenes call "
f"({len(prebuilt)} scenes)"
)
scenes = self._normalize_scenes_from_plan(video_plan, duration_metadata)
else:
logger.warning(
"[YouTubeSceneBuilder] Plan marked _scenes_included but no scenes present; "
"regenerating scenes normally."
)
scenes = self._generate_scenes_from_plan(
video_plan, duration_metadata, user_id
)
else:
# Generate scenes from plan
scenes = self._generate_scenes_from_plan(
video_plan, duration_metadata, user_id
)
# Limit to max scenes
if len(scenes) > max_scenes:
logger.warning(
f"[YouTubeSceneBuilder] Truncating {len(scenes)} scenes to {max_scenes}"
)
scenes = scenes[:max_scenes]
# Enhance visual prompts efficiently based on duration type
duration_type = video_plan.get("duration_type", "medium")
scenes = self._enhance_visual_prompts_batch(
scenes, video_plan, user_id, duration_type
)
logger.info(f"[YouTubeSceneBuilder] ✅ Built {len(scenes)} scenes")
return scenes
except HTTPException:
raise
except Exception as e:
logger.error(f"[YouTubeSceneBuilder] Error building scenes: {e}", exc_info=True)
raise HTTPException(
status_code=500,
detail=f"Failed to build scenes: {str(e)}"
)
def _generate_scenes_from_plan(
    self,
    video_plan: Dict[str, Any],
    duration_metadata: Dict[str, Any],
    user_id: str,
) -> List[Dict[str, Any]]:
    """Generate scenes from video plan using AI.

    Builds one large prompt from the plan's outline, hook strategy and CTA,
    requests a JSON array of scenes from the LLM in a single call, then
    normalizes each scene dict so every key downstream code reads is present.

    Args:
        video_plan: Plan dict; reads video_summary, video_goal, key_message,
            content_outline, hook_strategy, call_to_action, visual_style, tone.
        duration_metadata: Timing constraints (scene_duration_range,
            target_seconds, hook_seconds).
        user_id: Clerk user ID forwarded to llm_text_gen.

    Returns:
        List of normalized scene dicts (costs exactly 1 AI call).
    """
    content_outline = video_plan.get("content_outline", [])
    hook_strategy = video_plan.get("hook_strategy", "")
    call_to_action = video_plan.get("call_to_action", "")
    visual_style = video_plan.get("visual_style", "cinematic")
    tone = video_plan.get("tone", "professional")
    # Default (5, 15) mirrors the medium-duration profile; only used when the
    # planner did not supply a range.
    scene_duration_range = duration_metadata.get("scene_duration_range", (5, 15))
    # Prompt content is intentionally flush-left: it is runtime text sent to
    # the LLM, so its exact bytes matter.
    scene_generation_prompt = f"""You are a top YouTube scriptwriter specializing in engaging, viral content. Create compelling scenes that captivate viewers and maximize watch time.
**VIDEO PLAN:**
📝 Summary: {video_plan.get('video_summary', '')}
🎯 Goal: {video_plan.get('video_goal', '')}
💡 Key Message: {video_plan.get('key_message', '')}
🎨 Visual Style: {visual_style}
🎭 Tone: {tone}
**🎣 HOOK STRATEGY:**
{hook_strategy}
**📋 CONTENT STRUCTURE:**
{chr(10).join([f"{section.get('section', '')}: {section.get('description', '')} ({section.get('duration_estimate', 0)}s)" for section in content_outline])}
**🚀 CALL-TO-ACTION:**
{call_to_action}
**⏱️ TIMING CONSTRAINTS:**
• Scene duration: {scene_duration_range[0]}-{scene_duration_range[1]} seconds each
• Total target: {duration_metadata.get('target_seconds', 150)} seconds
**🎬 YOUR MISSION - CREATE VIRAL-WORTHY SCENES:**
Write narration that:
✨ **HOOKS IMMEDIATELY** - First {duration_metadata.get('hook_seconds', 10)}s must GRAB attention
🎭 **TELLS A STORY** - Each scene advances the narrative with emotional engagement
💡 **DELIVERS VALUE** - Provide insights, tips, or "aha!" moments in every scene
🔥 **BUILDS EXCITEMENT** - Use power words, questions, and cliffhangers
👥 **CONNECTS PERSONALLY** - Speak directly to the viewer's needs and desires
⚡ **MAINTAINS PACE** - Vary sentence length for natural rhythm
🎯 **DRIVES ACTION** - Build toward the CTA with increasing urgency
**REQUIRED SCENE ELEMENTS:**
1. **scene_number**: Sequential numbering
2. **title**: Catchy, descriptive title (5-8 words max)
3. **narration**: ENGAGING spoken script with:
- Conversational language ("you know what I mean?")
- Rhetorical questions ("Have you ever wondered...?")
- Power transitions ("But here's the game-changer...")
- Emotional hooks ("Imagine this...")
- Action-oriented language ("Let's dive in...")
4. **visual_description**: Cinematic, professional YouTube visuals
5. **duration_estimate**: Realistic speaking time
6. **emphasis**: hook/main_content/transition/cta
7. **visual_cues**: ["dramatic_zoom", "text_overlay", "fast_cuts"]
**🎯 YOUTUBE OPTIMIZATION RULES:**
• **Hook Power**: First 3 seconds = make them stay or lose them
• **Value Density**: Every 10 seconds must deliver new insight
• **Emotional Arc**: Build curiosity → teach → inspire → convert
• **Natural Flow**: Scenes must connect seamlessly
• **CTA Momentum**: Final scene creates irresistible urge to act
**📊 FORMAT AS JSON ARRAY:**
[
{{
"scene_number": 1,
"title": "The Shocking Truth They Hide",
"narration": "You won't believe what just happened in my latest discovery! I was scrolling through the usual content when BAM - this completely changed everything I thought about [topic]. And get this - it could transform YOUR results too!",
"visual_description": "Dynamic opening shot with shocking text overlay, fast cuts of social media feeds, energetic music swell, close-up of surprised reaction",
"duration_estimate": 8,
"emphasis": "hook",
"visual_cues": ["shocking_text", "fast_cuts", "music_swell", "reaction_shot"]
}},
...
]
**🔥 SUCCESS CRITERIA:**
✅ First scene hooks in 3 seconds
✅ Each scene delivers 1-2 key insights
✅ Narration feels like talking to a friend
✅ Total story arc creates emotional journey
✅ CTA feels like the natural next step
✅ Scenes fit duration perfectly"""
    system_prompt = (
        "You are a master YouTube scriptwriter who creates viral, engaging content that "
        "keeps viewers watching until the end. You understand YouTube algorithm optimization, "
        "emotional storytelling, and creating irresistible hooks that make viewers hit 'like' and 'subscribe'. "
        "Your scripts are conversational, valuable, and conversion-focused."
    )
    # json_struct asks the provider for structured output matching this schema.
    # Note that "visual_cues" is optional (absent from "required").
    response = llm_text_gen(
        prompt=scene_generation_prompt,
        system_prompt=system_prompt,
        user_id=user_id,
        json_struct={
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "scene_number": {"type": "number"},
                    "title": {"type": "string"},
                    "narration": {"type": "string"},
                    "visual_description": {"type": "string"},
                    "duration_estimate": {"type": "number"},
                    "emphasis": {"type": "string"},
                    "visual_cues": {
                        "type": "array",
                        "items": {"type": "string"}
                    }
                },
                "required": [
                    "scene_number", "title", "narration", "visual_description",
                    "duration_estimate", "emphasis"
                ]
            }
        }
    )
    # Parse response: providers may return a list, a {"scenes": [...]} dict,
    # or a raw JSON string.
    if isinstance(response, list):
        scenes = response
    elif isinstance(response, dict) and "scenes" in response:
        scenes = response["scenes"]
    else:
        import json
        scenes = json.loads(response) if isinstance(response, str) else response
    # Normalize scene data: fill defaults so every downstream-required key exists.
    normalized_scenes = []
    for idx, scene in enumerate(scenes, 1):
        normalized_scenes.append({
            "scene_number": scene.get("scene_number", idx),
            "title": scene.get("title", f"Scene {idx}"),
            "narration": scene.get("narration", ""),
            "visual_description": scene.get("visual_description", ""),
            "duration_estimate": scene.get("duration_estimate", scene_duration_range[0]),
            "emphasis": scene.get("emphasis", "main_content"),
            "visual_cues": scene.get("visual_cues", []),
            "visual_prompt": scene.get("visual_description", ""),  # Initial prompt
        })
    return normalized_scenes
def _normalize_scenes_from_plan(
self,
video_plan: Dict[str, Any],
duration_metadata: Dict[str, Any],
) -> List[Dict[str, Any]]:
"""Normalize scenes that were generated as part of the plan (optimization for shorts)."""
scenes = video_plan.get("scenes", [])
scene_duration_range = duration_metadata.get("scene_duration_range", (2, 8))
normalized_scenes = []
for idx, scene in enumerate(scenes, 1):
normalized_scenes.append({
"scene_number": scene.get("scene_number", idx),
"title": scene.get("title", f"Scene {idx}"),
"narration": scene.get("narration", ""),
"visual_description": scene.get("visual_description", ""),
"duration_estimate": scene.get("duration_estimate", scene_duration_range[0]),
"emphasis": scene.get("emphasis", "main_content"),
"visual_cues": scene.get("visual_cues", []),
"visual_prompt": scene.get("visual_description", ""), # Initial prompt
})
logger.info(
f"[YouTubeSceneBuilder] ✅ Normalized {len(normalized_scenes)} scenes "
f"from optimized plan (saved 1 AI call)"
)
return normalized_scenes
def _parse_custom_script(
self,
custom_script: str,
video_plan: Dict[str, Any],
duration_metadata: Dict[str, Any],
user_id: str,
) -> List[Dict[str, Any]]:
"""Parse a custom script into structured scenes."""
# Simple parsing: split by double newlines or scene markers
import re
# Try to detect scene markers
scene_pattern = r'(?:Scene\s+\d+|#\s*\d+\.|^\d+\.)\s*(.+?)(?=(?:Scene\s+\d+|#\s*\d+\.|^\d+\.|$))'
matches = re.finditer(scene_pattern, custom_script, re.MULTILINE | re.DOTALL)
scenes = []
for idx, match in enumerate(matches, 1):
scene_text = match.group(1).strip()
# Extract narration (first paragraph or before visual markers)
narration_match = re.search(r'^(.*?)(?:\n\n|Visual:|Image:)', scene_text, re.DOTALL)
narration = narration_match.group(1).strip() if narration_match else scene_text.split('\n')[0]
# Extract visual description
visual_match = re.search(r'(?:Visual:|Image:)\s*(.+?)(?:\n\n|$)', scene_text, re.DOTALL)
visual_description = visual_match.group(1).strip() if visual_match else narration
scenes.append({
"scene_number": idx,
"title": f"Scene {idx}",
"narration": narration,
"visual_description": visual_description,
"duration_estimate": duration_metadata.get("scene_duration_range", [5, 15])[0],
"emphasis": "hook" if idx == 1 else ("cta" if idx == len(list(matches)) else "main_content"),
"visual_cues": [],
"visual_prompt": visual_description,
})
# Fallback: split by paragraphs if no scene markers
if not scenes:
paragraphs = [p.strip() for p in custom_script.split('\n\n') if p.strip()]
for idx, para in enumerate(paragraphs[:duration_metadata.get("max_scenes", 10)], 1):
scenes.append({
"scene_number": idx,
"title": f"Scene {idx}",
"narration": para,
"visual_description": para,
"duration_estimate": duration_metadata.get("scene_duration_range", [5, 15])[0],
"emphasis": "hook" if idx == 1 else ("cta" if idx == len(paragraphs) else "main_content"),
"visual_cues": [],
"visual_prompt": para,
})
return scenes
def _enhance_visual_prompts_batch(
self,
scenes: List[Dict[str, Any]],
video_plan: Dict[str, Any],
user_id: str,
duration_type: str,
) -> List[Dict[str, Any]]:
"""
Efficiently enhance visual prompts based on video duration type.
Strategy:
- Shorts: Skip enhancement (use original descriptions) - 0 AI calls
- Medium: Batch enhance all scenes in 1 call - 1 AI call
- Long: Batch enhance in 2 calls (split scenes) - 2 AI calls max
"""
# For shorts, skip enhancement to save API calls
if duration_type == "shorts":
logger.info(
f"[YouTubeSceneBuilder] Skipping prompt enhancement for shorts "
f"({len(scenes)} scenes) to save API calls"
)
for scene in scenes:
scene["enhanced_visual_prompt"] = scene.get(
"visual_prompt", scene.get("visual_description", "")
)
return scenes
# Build story context for prompt enhancer
story_context = {
"story_setting": video_plan.get("visual_style", "cinematic"),
"story_tone": video_plan.get("tone", "professional"),
"writing_style": video_plan.get("visual_style", "cinematic"),
}
# Convert scenes to format expected by enhancer
scene_data_list = [
{
"scene_number": scene.get("scene_number", idx + 1),
"title": scene.get("title", ""),
"description": scene.get("visual_description", ""),
"image_prompt": scene.get("visual_prompt", ""),
}
for idx, scene in enumerate(scenes)
]
# For medium videos, enhance all scenes in one batch call
if duration_type == "medium":
logger.info(
f"[YouTubeSceneBuilder] Batch enhancing {len(scenes)} scenes "
f"for medium video in 1 AI call"
)
try:
# Use a single batch enhancement call
enhanced_prompts = self._batch_enhance_prompts(
scene_data_list, story_context, user_id
)
for idx, scene in enumerate(scenes):
scene["enhanced_visual_prompt"] = enhanced_prompts.get(
idx, scene.get("visual_prompt", scene.get("visual_description", ""))
)
except Exception as e:
logger.warning(
f"[YouTubeSceneBuilder] Batch enhancement failed: {e}, "
f"using original prompts"
)
for scene in scenes:
scene["enhanced_visual_prompt"] = scene.get(
"visual_prompt", scene.get("visual_description", "")
)
return scenes
# For long videos, split into 2 batches to avoid token limits
if duration_type == "long":
logger.info(
f"[YouTubeSceneBuilder] Batch enhancing {len(scenes)} scenes "
f"for long video in 2 AI calls"
)
mid_point = len(scenes) // 2
batches = [
scene_data_list[:mid_point],
scene_data_list[mid_point:],
]
all_enhanced = {}
for batch_idx, batch in enumerate(batches):
try:
enhanced = self._batch_enhance_prompts(
batch, story_context, user_id
)
start_idx = 0 if batch_idx == 0 else mid_point
for local_idx, enhanced_prompt in enhanced.items():
all_enhanced[start_idx + local_idx] = enhanced_prompt
except Exception as e:
logger.warning(
f"[YouTubeSceneBuilder] Batch {batch_idx + 1} enhancement "
f"failed: {e}, using original prompts"
)
start_idx = 0 if batch_idx == 0 else mid_point
for local_idx, scene_data in enumerate(batch):
all_enhanced[start_idx + local_idx] = scene_data.get(
"image_prompt", scene_data.get("description", "")
)
for idx, scene in enumerate(scenes):
scene["enhanced_visual_prompt"] = all_enhanced.get(
idx, scene.get("visual_prompt", scene.get("visual_description", ""))
)
return scenes
# Fallback: use original prompts
logger.warning(
f"[YouTubeSceneBuilder] Unknown duration type '{duration_type}', "
f"using original prompts"
)
for scene in scenes:
scene["enhanced_visual_prompt"] = scene.get(
"visual_prompt", scene.get("visual_description", "")
)
return scenes
def _batch_enhance_prompts(
    self,
    scene_data_list: List[Dict[str, Any]],
    story_context: Dict[str, Any],
    user_id: str,
) -> Dict[int, str]:
    """
    Enhance multiple scene prompts in a single AI call.

    Never raises: on any failure the original prompts ("image_prompt",
    falling back to "description") are returned for every scene.

    Args:
        scene_data_list: Scene dicts with scene_number/title/description/image_prompt.
        story_context: Style dict with story_setting/story_tone/writing_style.
        user_id: Clerk user ID forwarded to llm_text_gen.

    Returns:
        Dictionary mapping scene index to enhanced prompt
    """
    try:
        # Build batch enhancement prompt: one text section per scene.
        scenes_text = "\n\n".join([
            f"Scene {scene.get('scene_number', idx + 1)}: {scene.get('title', '')}\n"
            f"Description: {scene.get('description', '')}\n"
            f"Current Prompt: {scene.get('image_prompt', '')}"
            for idx, scene in enumerate(scene_data_list)
        ])
        # Prompt content is flush-left on purpose: it is runtime text sent to
        # the LLM, so its exact bytes matter.
        batch_prompt = f"""You are optimizing visual prompts for AI video generation. Enhance the following scenes to be more detailed and video-optimized.
**Video Style Context:**
- Setting: {story_context.get('story_setting', 'cinematic')}
- Tone: {story_context.get('story_tone', 'professional')}
- Style: {story_context.get('writing_style', 'cinematic')}
**Scenes to Enhance:**
{scenes_text}
**Your Task:**
For each scene, create an enhanced visual prompt (200-300 words) that:
1. Is detailed and specific for video generation
2. Includes camera movements, lighting, composition
3. Maintains consistency with the video style
4. Is optimized for WAN 2.5 text-to-video model
**Format as JSON array with enhanced prompts:**
[
{{"scene_index": 0, "enhanced_prompt": "detailed enhanced prompt for scene 1..."}},
{{"scene_index": 1, "enhanced_prompt": "detailed enhanced prompt for scene 2..."}},
...
]
Make sure the array length matches the number of scenes provided ({len(scene_data_list)}).
"""
        system_prompt = (
            "You are an expert at creating detailed visual prompts for AI video generation. "
            "Your prompts are specific, cinematic, and optimized for video models."
        )
        response = llm_text_gen(
            prompt=batch_prompt,
            system_prompt=system_prompt,
            user_id=user_id,
            json_struct={
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "scene_index": {"type": "number"},
                        "enhanced_prompt": {"type": "string"}
                    },
                    "required": ["scene_index", "enhanced_prompt"]
                }
            }
        )
        # Parse response: provider may return a parsed list or a JSON string.
        if isinstance(response, list):
            enhanced_list = response
        elif isinstance(response, str):
            import json
            enhanced_list = json.loads(response)
        else:
            enhanced_list = response
        # Build result dictionary keyed by the model-reported scene_index.
        # NOTE(review): an out-of-range scene_index from the model is kept in
        # the result as-is for the non-empty-prompt case; callers tolerate this
        # because they look indices up with .get().
        result = {}
        for item in enhanced_list:
            idx = item.get("scene_index", 0)
            prompt = item.get("enhanced_prompt", "")
            if prompt:
                result[idx] = prompt
            else:
                # Fallback to original
                original_scene = scene_data_list[idx] if idx < len(scene_data_list) else {}
                result[idx] = original_scene.get(
                    "image_prompt", original_scene.get("description", "")
                )
        # Fill in any missing scenes with original prompts
        for idx in range(len(scene_data_list)):
            if idx not in result:
                original_scene = scene_data_list[idx]
                result[idx] = original_scene.get(
                    "image_prompt", original_scene.get("description", "")
                )
        logger.info(
            f"[YouTubeSceneBuilder] ✅ Batch enhanced {len(result)} prompts "
            f"in 1 AI call"
        )
        return result
    except Exception as e:
        logger.error(
            f"[YouTubeSceneBuilder] Batch enhancement failed: {e}",
            exc_info=True
        )
        # Return original prompts as fallback
        return {
            idx: scene.get("image_prompt", scene.get("description", ""))
            for idx, scene in enumerate(scene_data_list)
        }