Base code

This commit is contained in:
Kunthawat Greethong
2026-01-08 22:39:53 +07:00
parent 697115c61a
commit c35fa52117
2169 changed files with 626670 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
"""YouTube Creator Studio services."""

View File

@@ -0,0 +1,853 @@
"""
YouTube Video Planner Service
Generates video plans, outlines, and insights using AI with persona integration.
Supports optional Exa research for enhanced, data-driven plans.
"""
from typing import Dict, Any, Optional, List
from loguru import logger
from fastapi import HTTPException
import os
from services.llm_providers.main_text_generation import llm_text_gen
from utils.logger_utils import get_service_logger
logger = get_service_logger("youtube.planner")
# Per-video-type prompt presets, keyed by the `video_type` value accepted by
# YouTubePlannerService.generate_video_plan.
#
# Every entry carries the same seven string fields. In the visible part of this
# file, generate_video_plan reads "structure", "hook_strategy", "visual_style",
# "tone" and "cta_focus" when composing the planning prompt, and
# _validate_and_enhance_plan falls back to "avatar_style" when the model omits
# avatar recommendations. "optimal_scenes" is not read anywhere in the visible
# code (NOTE(review): possibly consumed by another module - confirm).
VIDEO_TYPE_CONFIGS: Dict[str, Dict[str, str]] = {
    # Step-by-step instructional content.
    "tutorial": {
        "hook_strategy": "Problem statement or quick preview of solution",
        "structure": "Problem → Steps → Result → Key Takeaways",
        "visual_style": "Clean, instructional, screen-recordings or clear demonstrations",
        "tone": "Clear, patient, instructional",
        "optimal_scenes": "2-6 scenes showing sequential steps",
        "avatar_style": "Approachable instructor, professional yet friendly",
        "cta_focus": "Subscribe for more tutorials, try it yourself"
    },
    # Product or service evaluation.
    "review": {
        "hook_strategy": "Product reveal or strong opinion statement",
        "structure": "Hook → Overview → Pros/Cons → Verdict → CTA",
        "visual_style": "Product-focused, close-ups, comparison shots",
        "tone": "Honest, engaging, opinionated but fair",
        "optimal_scenes": "4-8 scenes covering different aspects",
        "avatar_style": "Trustworthy reviewer, confident, credible",
        "cta_focus": "Check links in description, subscribe for reviews"
    },
    # Concept explanation.
    "educational": {
        "hook_strategy": "Intriguing question or surprising fact",
        "structure": "Question → Explanation → Examples → Conclusion",
        "visual_style": "Illustrative, concept visualization, animations",
        "tone": "Authoritative yet accessible, engaging",
        "optimal_scenes": "3-10 scenes breaking down concepts",
        "avatar_style": "Knowledgeable educator, professional, warm",
        "cta_focus": "Learn more, subscribe for educational content"
    },
    # Humor / personality-driven content.
    "entertainment": {
        "hook_strategy": "Grab attention immediately with energy/humor",
        "structure": "Hook → Setup → Payoff → Share/Subscribe",
        "visual_style": "Dynamic, energetic, varied angles, transitions",
        "tone": "High energy, funny, engaging, personality-driven",
        "optimal_scenes": "3-8 scenes with varied pacing",
        "avatar_style": "Energetic creator, expressive, relatable",
        "cta_focus": "Like, share, subscribe for more fun content"
    },
    # Personal day-in-the-life style content.
    "vlog": {
        "hook_strategy": "Preview of day/event or personal moment",
        "structure": "Introduction → Journey/Experience → Reflection → CTA",
        "visual_style": "Natural, personal, authentic moments",
        "tone": "Conversational, authentic, relatable",
        "optimal_scenes": "5-15 scenes following narrative",
        "avatar_style": "Authentic person, approachable, real",
        "cta_focus": "Follow my journey, subscribe for daily updates"
    },
    # Commercial feature walkthrough.
    "product_demo": {
        "hook_strategy": "Product benefit or transformation",
        "structure": "Benefit → Features → Use Cases → CTA",
        "visual_style": "Product-focused, polished, commercial quality",
        "tone": "Enthusiastic, persuasive, benefit-focused",
        "optimal_scenes": "3-7 scenes highlighting features",
        "avatar_style": "Professional presenter, polished, confident",
        "cta_focus": "Get it now, learn more, special offer"
    },
    # Creator reacting to other content.
    "reaction": {
        "hook_strategy": "Preview of reaction or content being reacted to",
        "structure": "Setup → Reaction → Commentary → CTA",
        "visual_style": "Split-screen or picture-in-picture, expressive",
        "tone": "Authentic reactions, engaging commentary",
        "optimal_scenes": "4-10 scenes with reactions",
        "avatar_style": "Expressive creator, authentic reactions",
        "cta_focus": "Watch full video, subscribe for reactions"
    },
    # Narrative-arc content.
    "storytelling": {
        "hook_strategy": "Intriguing opening or compelling question",
        "structure": "Hook → Setup → Conflict → Resolution → CTA",
        "visual_style": "Cinematic, narrative-driven, emotional",
        "tone": "Engaging, immersive, story-focused",
        "optimal_scenes": "6-15 scenes following narrative arc",
        "avatar_style": "Storyteller, warm, engaging narrator",
        "cta_focus": "Subscribe for more stories, share your thoughts"
    }
}
class YouTubePlannerService:
"""Service for planning YouTube videos with AI assistance."""
def __init__(self) -> None:
    """Initialize the planner service.

    The constructor stores no state on the instance; it only emits an
    info-level log line so service start-up is visible in the logs.
    """
    logger.info("[YouTubePlanner] Service initialized")
async def generate_video_plan(
    self,
    user_idea: str,
    duration_type: str,  # "shorts", "medium", "long"
    video_type: Optional[str] = None,  # "tutorial", "review", etc.
    target_audience: Optional[str] = None,
    video_goal: Optional[str] = None,
    brand_style: Optional[str] = None,
    persona_data: Optional[Dict[str, Any]] = None,
    reference_image_description: Optional[str] = None,
    source_content_id: Optional[str] = None,  # For blog/story conversion
    source_content_type: Optional[str] = None,  # "blog", "story"
    user_id: Optional[str] = None,
    include_scenes: bool = False,  # For shorts: combine plan + scenes in one call
    enable_research: bool = True,  # Always enable research by default for enhanced plans
) -> Dict[str, Any]:
    """
    Generate a comprehensive video plan from user input.

    Args:
        user_idea: User's video idea or topic
        duration_type: "shorts" (≤60s), "medium" (1-4min), "long" (4-10min);
            unknown values fall back to the "medium" constraints
        video_type: Optional video format type (tutorial, review, etc.);
            values not present in VIDEO_TYPE_CONFIGS are treated as "no preset"
        target_audience: Optional target audience description
        video_goal: Optional primary goal of the video
        brand_style: Optional brand aesthetic preferences
        persona_data: Optional persona data for tone/style
        reference_image_description: Optional description of reference image
        source_content_id: Optional ID of source content (blog/story)
        source_content_type: Type of source content
        user_id: Clerk user ID for subscription checking
        include_scenes: When True and duration_type is "shorts", the model is
            also asked for a scene breakdown in the same call
        enable_research: When True, Exa research is attempted first; any
            research failure is logged and the plan is generated without it

    Returns:
        Dictionary with video plan, outline, insights, and metadata
        (duration_type, duration_metadata, user_idea, research_enabled,
        research_sources, research_sources_count, and - for combined shorts
        requests - _scenes_included).

    Raises:
        HTTPException: 500 when the model response cannot be parsed into a
            JSON object or any unexpected error occurs; HTTPExceptions raised
            by the steps below are re-raised unchanged.
    """
    try:
        logger.info(
            f"[YouTubePlanner] Generating plan: idea={user_idea[:50]}..., "
            f"duration={duration_type}, video_type={video_type}, user={user_id}"
        )

        # Get video type config (empty dict means "no preset selected")
        video_type_config = {}
        if video_type and video_type in VIDEO_TYPE_CONFIGS:
            video_type_config = VIDEO_TYPE_CONFIGS[video_type]

        # Build persona context
        persona_context = self._build_persona_context(persona_data)

        # Build duration context
        duration_context = self._get_duration_context(duration_type)

        # Build source content context if provided
        source_context = ""
        if source_content_id and source_content_type:
            source_context = f"""
**Source Content:**
- Type: {source_content_type}
- ID: {source_content_id}
- Note: This video should be based on the existing {source_content_type} content.
"""

        # Build reference image context
        image_context = ""
        if reference_image_description:
            image_context = f"""
**Reference Image:**
{reference_image_description}
- Use this as visual inspiration for the video
"""

        # Smart defaults: a selected video type supplies tone/visual defaults;
        # otherwise user inputs or generic defaults are used.
        if video_type_config:
            default_tone = video_type_config.get('tone', 'Professional and engaging')
            default_visual_style = video_type_config.get('visual_style', 'Professional and engaging')
            default_goal = video_goal or f"Create engaging {video_type} content"
            default_audience = target_audience or f"Viewers interested in {video_type} content"
        else:
            default_tone = 'Professional and engaging'
            default_visual_style = 'Professional and engaging'
            default_goal = video_goal or 'Engage and inform viewers'
            default_audience = target_audience or 'General YouTube audience'

        # Perform Exa research if enabled (after defaults are set)
        research_context = ""
        research_sources = []
        research_enabled = False
        if enable_research:
            logger.info(f"[YouTubePlanner] 🔍 Starting Exa research for plan generation (idea: {user_idea[:50]}...)")
            # Records that research was requested, even if it fails below.
            research_enabled = True
            try:
                research_context, research_sources = await self._perform_exa_research(
                    user_idea=user_idea,
                    video_type=video_type,
                    target_audience=default_audience,
                    user_id=user_id
                )
                if research_sources:
                    logger.info(
                        f"[YouTubePlanner] ✅ Exa research completed successfully: "
                        f"{len(research_sources)} sources found. Research context length: {len(research_context)} chars"
                    )
                else:
                    logger.warning(f"[YouTubePlanner] ⚠️ Exa research completed but no sources returned")
            except HTTPException as http_ex:
                # Subscription limit exceeded or other HTTP errors
                error_detail = http_ex.detail
                if isinstance(error_detail, dict):
                    error_msg = error_detail.get("message", error_detail.get("error", str(http_ex)))
                else:
                    error_msg = str(error_detail)
                logger.warning(
                    f"[YouTubePlanner] ⚠️ Exa research skipped due to subscription limits or error: {error_msg} "
                    f"(status={http_ex.status_code}). Continuing without research."
                )
                # Continue without research - non-critical failure
            except Exception as e:
                error_msg = str(e)
                logger.warning(
                    f"[YouTubePlanner] ⚠️ Exa research failed (non-critical): {error_msg}. "
                    f"Continuing without research."
                )
                # Continue without research - non-critical failure
        else:
            logger.info(f"[YouTubePlanner] Exa research disabled for this plan generation")

        # Generate comprehensive video plan
        video_type_context = ""
        if video_type_config:
            video_type_context = f"""
**Video Type: {video_type}**
Follow these guidelines:
- Structure: {video_type_config.get('structure', '')}
- Hook: {video_type_config.get('hook_strategy', '')}
- Visual: {video_type_config.get('visual_style', '')}
- Tone: {video_type_config.get('tone', '')}
- CTA: {video_type_config.get('cta_focus', '')}
"""

        planning_prompt = f"""Create a YouTube video plan for: "{user_idea}"
**Video Format:** {video_type or 'General'} | **Duration:** {duration_type} ({duration_context['target_seconds']}s target)
**Audience:** {default_audience}
**Goal:** {default_goal}
**Style:** {brand_style or default_visual_style}
{video_type_context}
**Constraints:**
- Duration: {duration_context['target_seconds']}s (Hook: {duration_context['hook_seconds']}s, Main: {duration_context['main_seconds']}s, CTA: {duration_context['cta_seconds']}s)
- Max scenes: {duration_context['max_scenes']}
{persona_context if persona_data else ""}
{source_context if source_content_id else ""}
{image_context if reference_image_description else ""}
{research_context if research_context else ""}
**Generate a plan with:**
1. **Video Summary**: 2-3 sentences capturing the essence
2. **Target Audience**: {f"Match: {target_audience}" if target_audience else f"Infer from video idea and {video_type or 'content type'}"}
3. **Video Goal**: {f"Align with: {video_goal}" if video_goal else f"Infer appropriate goal for {video_type or 'this'} content"}
4. **Key Message**: Single memorable takeaway
5. **Hook Strategy**: Engaging opening for first {duration_context['hook_seconds']}s{f" ({video_type_config.get('hook_strategy', '')})" if video_type_config else ""}
6. **Content Outline**: 3-5 sections totaling {duration_context['target_seconds']}s{f" following: {video_type_config.get('structure', '')}" if video_type_config else ""}
7. **Call-to-Action**: Actionable CTA{f" ({video_type_config.get('cta_focus', '')})" if video_type_config else ""}
8. **Visual Style**: Match {brand_style or default_visual_style}
9. **Tone**: {default_tone}
10. **SEO Keywords**: 5-7 relevant terms based on video idea
11. **Avatar Recommendations**: {f"{video_type_config.get('avatar_style', '')} " if video_type_config else ""}matching audience and style
**Response Format (JSON):**
{{
"video_summary": "...",
"target_audience": "...",
"video_goal": "...",
"key_message": "...",
"hook_strategy": "...",
"content_outline": [
{{"section": "...", "description": "...", "duration_estimate": 30}},
{{"section": "...", "description": "...", "duration_estimate": 45}}
],
"call_to_action": "...",
"visual_style": "...",
"tone": "...",
"seo_keywords": ["keyword1", "keyword2", ...],
"avatar_recommendations": {{
"description": "...",
"style": "...",
"energy": "..."
}}
}}
**Critical:** Content outline durations must sum to {duration_context['target_seconds']}s (±20%).
"""

        system_prompt = (
            "You are an expert YouTube content strategist. Create clear, actionable video plans "
            "that are optimized for the specified video type and audience. Focus on accuracy and "
            "specificity - these plans will be used to generate actual video content."
        )

        # For shorts, combine plan + scenes in one call to save API calls
        combine_scenes = include_scenes and duration_type == "shorts"
        if combine_scenes:
            planning_prompt += f"""
**IMPORTANT: Since this is a SHORTS video, also generate the complete scene breakdown in the same response.**
**Additional Task - Generate Detailed Scenes:**
Create detailed scenes (up to {duration_context['max_scenes']} scenes) that include:
1. Scene number and title
2. Narration text (what will be spoken) - keep it concise for shorts
3. Visual description (what viewers will see)
4. Duration estimate (2-8 seconds each)
5. Emphasis tags (hook, main_content, transition, cta)
**Scene Format:**
Each scene should be detailed enough for video generation. Total duration must fit within {duration_context['target_seconds']} seconds.
**Update JSON structure to include "scenes" array and "avatar_recommendations":**
Add a "scenes" field with the complete scene breakdown, and include "avatar_recommendations" with ideal presenter appearance, style, and energy.
"""
        json_struct = self._build_plan_json_schema(with_scenes=combine_scenes)

        # Generate plan using LLM with structured JSON response
        # llm_text_gen handles subscription checks and provider selection automatically
        # json_struct ensures deterministic structured response (returns dict, not string)
        # NOTE(review): llm_text_gen is called without await inside a coroutine;
        # if it blocks on network I/O it would hold the event loop - confirm.
        response = llm_text_gen(
            prompt=planning_prompt,
            system_prompt=system_prompt,
            user_id=user_id,
            json_struct=json_struct
        )

        # Parse response (structured responses return dict, text responses return string)
        if isinstance(response, dict):
            plan_data = response
        else:
            import json
            try:
                plan_data = json.loads(response)
            except json.JSONDecodeError as e:
                logger.error(f"[YouTubePlanner] Failed to parse JSON response: {e}")
                logger.debug(f"[YouTubePlanner] Raw response: {response[:500]}")
                raise HTTPException(
                    status_code=500,
                    detail="Failed to parse video plan response. Please try again."
                )
        if not isinstance(plan_data, dict):
            # Valid JSON that is not an object (list, string, number) cannot be a
            # plan; fail with the parse message instead of a later AttributeError.
            logger.error(
                f"[YouTubePlanner] Failed to parse JSON response: expected object, "
                f"got {type(plan_data).__name__}"
            )
            raise HTTPException(
                status_code=500,
                detail="Failed to parse video plan response. Please try again."
            )

        # Validate and enhance plan quality
        plan_data = self._validate_and_enhance_plan(
            plan_data, duration_context, video_type, video_type_config
        )

        # Add metadata
        plan_data["duration_type"] = duration_type
        plan_data["duration_metadata"] = duration_context
        plan_data["user_idea"] = user_idea

        # Add research metadata to plan
        plan_data["research_enabled"] = research_enabled
        plan_data["research_sources"] = research_sources or []
        plan_data["research_sources_count"] = len(plan_data["research_sources"])

        # Log research status in plan metadata for debugging
        if research_enabled:
            logger.info(
                f"[YouTubePlanner] 📊 Plan metadata: research_enabled=True, "
                f"research_sources_count={plan_data.get('research_sources_count', 0)}, "
                f"research_context_length={len(research_context)} chars"
            )

        # Validate and process scenes if included (for shorts)
        if combine_scenes:
            self._process_included_scenes(plan_data, duration_context)

        logger.info(f"[YouTubePlanner] ✅ Plan generated successfully")
        return plan_data
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"[YouTubePlanner] Error generating plan: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to generate video plan: {str(e)}"
        ) from e

@staticmethod
def _build_plan_json_schema(with_scenes: bool) -> Dict[str, Any]:
    """Return the JSON schema passed to the LLM; adds "scenes" when requested.

    Key order matches the two hand-written schemas this replaces: "scenes"
    (when present) sits between "seo_keywords" and "avatar_recommendations"
    in both "properties" and "required".
    """
    properties: Dict[str, Any] = {
        "video_summary": {"type": "string"},
        "target_audience": {"type": "string"},
        "video_goal": {"type": "string"},
        "key_message": {"type": "string"},
        "hook_strategy": {"type": "string"},
        "content_outline": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "section": {"type": "string"},
                    "description": {"type": "string"},
                    "duration_estimate": {"type": "number"}
                }
            }
        },
        "call_to_action": {"type": "string"},
        "visual_style": {"type": "string"},
        "tone": {"type": "string"},
        "seo_keywords": {
            "type": "array",
            "items": {"type": "string"}
        },
    }
    required = [
        "video_summary", "target_audience", "video_goal", "key_message",
        "hook_strategy", "content_outline", "call_to_action",
        "visual_style", "tone", "seo_keywords",
    ]
    if with_scenes:
        properties["scenes"] = {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "scene_number": {"type": "number"},
                    "title": {"type": "string"},
                    "narration": {"type": "string"},
                    "visual_description": {"type": "string"},
                    "duration_estimate": {"type": "number"},
                    "emphasis": {"type": "string"},
                    "visual_cues": {
                        "type": "array",
                        "items": {"type": "string"}
                    }
                },
                "required": [
                    "scene_number", "title", "narration", "visual_description",
                    "duration_estimate", "emphasis"
                ]
            }
        }
        required.append("scenes")
    properties["avatar_recommendations"] = {
        "type": "object",
        "properties": {
            "description": {"type": "string"},
            "style": {"type": "string"},
            "energy": {"type": "string"}
        }
    }
    required.append("avatar_recommendations")
    return {"type": "object", "properties": properties, "required": required}

@staticmethod
def _process_included_scenes(plan_data: Dict[str, Any], duration_context: Dict[str, Any]) -> None:
    """Cap and sanity-check scenes returned alongside a shorts plan (in place).

    Sets plan_data["_scenes_included"] so downstream code knows whether a
    separate scene-generation call is still needed.
    """
    scenes = plan_data.get("scenes")
    if not scenes:
        # LLM did not return scenes; downstream will regenerate
        plan_data["_scenes_included"] = False
        logger.warning(
            "[YouTubePlanner] Shorts optimization requested but no scenes returned; "
            "scene builder will generate scenes separately."
        )
        return

    max_scenes = duration_context["max_scenes"]
    target_duration = duration_context["target_seconds"]
    scene_count = len(scenes)
    if scene_count > max_scenes:
        logger.warning(
            f"[YouTubePlanner] Scene count ({scene_count}) exceeds max ({max_scenes}). "
            f"Truncating to first {max_scenes} scenes."
        )
        scenes = scenes[:max_scenes]
        plan_data["scenes"] = scenes

    # Total is taken AFTER truncation so the warning reflects the scenes that
    # are actually kept; missing/non-numeric estimates count as 0 seconds.
    total_scene_duration = 0
    for scene in scenes:
        estimate = scene.get("duration_estimate") if isinstance(scene, dict) else None
        if isinstance(estimate, (int, float)):
            total_scene_duration += estimate

    # Warn if total duration is off by more than 30%
    if abs(total_scene_duration - target_duration) > target_duration * 0.3:
        logger.warning(
            f"[YouTubePlanner] Total scene duration ({total_scene_duration}s) "
            f"differs significantly from target ({target_duration}s)"
        )
    plan_data["_scenes_included"] = True
    logger.info(
        f"[YouTubePlanner] ✅ Plan + {len(plan_data['scenes'])} scenes "
        f"generated in 1 AI call (optimized for shorts)"
    )
def _build_persona_context(self, persona_data: Optional[Dict[str, Any]]) -> str:
"""Build persona context string for prompts."""
if not persona_data:
return """
**Persona Context:**
- Using default professional tone
- No specific persona constraints
"""
core_persona = persona_data.get("core_persona", {})
tone = core_persona.get("tone", "professional")
voice = core_persona.get("voice_characteristics", {})
return f"""
**Persona Context:**
- Tone: {tone}
- Voice Style: {voice.get('style', 'professional')}
- Communication Style: {voice.get('communication_style', 'clear and direct')}
- Brand Values: {core_persona.get('core_belief', 'value-driven content')}
- Use this persona to guide the video's tone, style, and messaging approach.
"""
def _get_duration_context(self, duration_type: str) -> Dict[str, Any]:
"""Get duration-specific context and constraints."""
contexts = {
"shorts": {
"description": "YouTube Shorts (15-60 seconds)",
"target_seconds": 30,
"hook_seconds": 3,
"main_seconds": 24,
"cta_seconds": 3,
# Keep scenes tight for shorts to control cost and pacing
"max_scenes": 4,
"scene_duration_range": (2, 8)
},
"medium": {
"description": "Medium-length video (1-4 minutes)",
"target_seconds": 150, # 2.5 minutes
"hook_seconds": 10,
"main_seconds": 130,
"cta_seconds": 10,
"max_scenes": 12,
"scene_duration_range": (5, 15)
},
"long": {
"description": "Long-form video (4-10 minutes)",
"target_seconds": 420, # 7 minutes
"hook_seconds": 15,
"main_seconds": 380,
"cta_seconds": 25,
"max_scenes": 20,
"scene_duration_range": (10, 30)
}
}
return contexts.get(duration_type, contexts["medium"])
def _validate_and_enhance_plan(
self,
plan_data: Dict[str, Any],
duration_context: Dict[str, Any],
video_type: Optional[str],
video_type_config: Dict[str, Any],
) -> Dict[str, Any]:
"""
Validate and enhance plan quality before returning.
Performs quality checks:
- Validates required fields
- Validates content outline duration matches target
- Ensures SEO keywords are present
- Validates avatar recommendations
- Adds quality metadata
"""
# Ensure required fields exist
required_fields = [
"video_summary", "target_audience", "video_goal", "key_message",
"hook_strategy", "content_outline", "call_to_action",
"visual_style", "tone", "seo_keywords"
]
missing_fields = [field for field in required_fields if not plan_data.get(field)]
if missing_fields:
logger.warning(f"[YouTubePlanner] Missing required fields: {missing_fields}")
# Fill with defaults to prevent errors
for field in missing_fields:
if field == "seo_keywords":
plan_data[field] = []
elif field == "content_outline":
plan_data[field] = []
else:
plan_data[field] = f"[{field} not generated]"
# Validate content outline duration
if plan_data.get("content_outline"):
total_duration = sum(
section.get("duration_estimate", 0)
for section in plan_data["content_outline"]
)
target_duration = duration_context.get("target_seconds", 150)
# Allow 20% variance
tolerance = target_duration * 0.2
if abs(total_duration - target_duration) > tolerance:
logger.warning(
f"[YouTubePlanner] Content outline duration ({total_duration}s) "
f"doesn't match target ({target_duration}s). Adjusting..."
)
# Normalize durations proportionally
if total_duration > 0:
scale_factor = target_duration / total_duration
for section in plan_data["content_outline"]:
if "duration_estimate" in section:
section["duration_estimate"] = round(
section["duration_estimate"] * scale_factor, 1
)
# Validate SEO keywords
if not plan_data.get("seo_keywords") or len(plan_data["seo_keywords"]) < 3:
logger.warning(
f"[YouTubePlanner] Insufficient SEO keywords ({len(plan_data.get('seo_keywords', []))}). "
f"Plan may need enhancement."
)
# Validate avatar recommendations
if not plan_data.get("avatar_recommendations"):
logger.warning("[YouTubePlanner] Avatar recommendations missing. Generating defaults...")
plan_data["avatar_recommendations"] = {
"description": video_type_config.get("avatar_style", "Professional YouTube creator"),
"style": plan_data.get("visual_style", "Professional"),
"energy": plan_data.get("tone", "Engaging")
}
else:
# Ensure all avatar recommendation fields exist
avatar_rec = plan_data["avatar_recommendations"]
if not avatar_rec.get("description"):
avatar_rec["description"] = video_type_config.get("avatar_style", "Professional YouTube creator")
if not avatar_rec.get("style"):
avatar_rec["style"] = plan_data.get("visual_style", "Professional")
if not avatar_rec.get("energy"):
avatar_rec["energy"] = plan_data.get("tone", "Engaging")
# Add quality metadata
plan_data["_quality_checks"] = {
"content_outline_validated": bool(plan_data.get("content_outline")),
"seo_keywords_count": len(plan_data.get("seo_keywords", [])),
"avatar_recommendations_present": bool(plan_data.get("avatar_recommendations")),
"all_required_fields_present": len(missing_fields) == 0,
}
logger.info(
f"[YouTubePlanner] Plan quality validated: "
f"outline_sections={len(plan_data.get('content_outline', []))}, "
f"seo_keywords={len(plan_data.get('seo_keywords', []))}, "
f"avatar_recs={'yes' if plan_data.get('avatar_recommendations') else 'no'}"
)
return plan_data
async def _perform_exa_research(
    self,
    user_idea: str,
    video_type: Optional[str],
    target_audience: str,
    user_id: str
) -> tuple[str, List[Dict[str, Any]]]:
    """
    Perform Exa research directly using ExaResearchProvider (common module).
    Uses the same pattern as podcast research with proper subscription checks.

    Steps: pre-flight subscription check for one Exa search, run the search,
    record usage cost, then turn the result into (a) a prompt-ready context
    block and (b) a trimmed list of sources for the API response.

    Args:
        user_idea: Topic text; first component of the search query.
        video_type: Optional type; appended to the query as "<type> video".
        target_audience: Appended to the query unless it is the generic
            "General YouTube audience" default.
        user_id: User the limit check and usage tracking are performed for.

    Returns:
        Tuple of (research_context_string, research_sources_list). Both are
        empty when research fails for any non-HTTP reason.

    Raises:
        HTTPException: 429 when the subscription pre-flight check blocks the
            search; other HTTPExceptions from the called services propagate.
    """
    try:
        # Pre-flight validation for Exa search only (not full blog writer workflow)
        # We only need to validate Exa API calls, not LLM operations
        from services.database import get_db
        from services.subscription import PricingService
        from models.subscription_models import APIProvider

        db = next(get_db())
        try:
            pricing_service = PricingService(db)
            # Only validate Exa API call, not the full research workflow
            operations_to_validate = [
                {
                    'provider': APIProvider.EXA,
                    'tokens_requested': 0,
                    'actual_provider_name': 'exa',
                    'operation_type': 'exa_neural_search'
                }
            ]
            can_proceed, message, error_details = pricing_service.check_comprehensive_limits(
                user_id=user_id,
                operations=operations_to_validate
            )
            if not can_proceed:
                usage_info = error_details.get('usage_info', {}) if error_details else {}
                logger.warning(
                    f"[YouTubePlanner] Exa search blocked for user {user_id}: {message}"
                )
                raise HTTPException(
                    status_code=429,
                    detail={
                        'error': message,
                        'message': message,
                        'provider': 'exa',
                        'usage_info': usage_info if usage_info else error_details
                    }
                )
            logger.info(f"[YouTubePlanner] Exa search pre-flight validation passed for user {user_id}")
        except HTTPException:
            raise
        except Exception as e:
            logger.warning(f"[YouTubePlanner] Exa search pre-flight validation failed: {e}")
            raise
        finally:
            # The session is closed whether the check passed, blocked, or failed.
            db.close()

        # Use ExaResearchProvider directly (common module, same as podcast)
        from services.blog_writer.research.exa_provider import ExaResearchProvider
        from types import SimpleNamespace

        # Build research query
        query_parts = [user_idea]
        if video_type:
            query_parts.append(f"{video_type} video")
        if target_audience and target_audience != "General YouTube audience":
            query_parts.append(target_audience)
        research_query = " ".join(query_parts)

        # Configure Exa research (same pattern as podcast)
        cfg = SimpleNamespace(
            exa_search_type="neural",
            exa_category="web",  # Focus on web content for YouTube
            exa_include_domains=[],
            exa_exclude_domains=[],
            max_sources=10,  # Limit sources for cost efficiency
            source_types=[],
        )

        # Perform research
        provider = ExaResearchProvider()
        result = await provider.search(
            prompt=research_query,
            topic=user_idea,
            industry="",
            target_audience=target_audience,
            config=cfg,
            user_id=user_id,
        )

        # Track usage; 0.005 is the fallback cost when the result carries no
        # usable cost entry, 0.0 when the result is not a dict at all.
        cost_total = 0.0
        if isinstance(result, dict):
            cost = result.get("cost")
            cost_total = cost.get("total", 0.005) if cost else 0.005
        provider.track_exa_usage(user_id, cost_total)

        if not isinstance(result, dict):
            # Explicit guard instead of relying on an AttributeError from .get().
            logger.warning(
                f"[YouTubePlanner] Exa research returned unexpected result type "
                f"({type(result).__name__}); continuing without research"
            )
            return "", []

        # Extract sources and content; entries that are not dicts are dropped
        sources = [source for source in (result.get("sources") or []) if isinstance(source, dict)]
        research_content = result.get("content", "")

        research_context = self._build_research_context(research_content, sources)
        formatted_sources = self._format_research_sources(sources)
        logger.info(f"[YouTubePlanner] Exa research completed: {len(formatted_sources)} sources found")
        return research_context, formatted_sources
    except HTTPException:
        # Re-raise HTTPException (subscription limits, etc.)
        raise
    except Exception as e:
        logger.error(f"[YouTubePlanner] Research error: {e}", exc_info=True)
        # Non-critical failure - return empty research
        return "", []

@staticmethod
def _source_excerpt(source: Dict[str, Any], limit: int) -> str:
    """Return the source's excerpt, or its summary when no excerpt exists, cut to `limit` chars."""
    excerpt = (source.get("excerpt", "") or "")[:limit]
    if not excerpt:
        excerpt = (source.get("summary", "") or "")[:limit]
    return excerpt

def _build_research_context(self, research_content: str, sources: List[Dict[str, Any]]) -> str:
    """Render research text plus the top 5 sources as a prompt section ("" if either is empty)."""
    if not (research_content and sources):
        return ""
    # Limit content to 2000 chars to avoid token bloat
    limited_content = research_content[:2000]
    research_context = f"""
**Research & Current Information:**
Based on current web research, here are relevant insights and trends:
{limited_content}
**Key Research Sources ({len(sources)} sources):**
"""
    # Add top 5 sources for context
    for idx, source in enumerate(sources[:5], 1):
        title = source.get("title", "Untitled") or "Untitled"
        url = source.get("url", "") or ""
        excerpt = self._source_excerpt(source, 200)
        research_context += f"\n{idx}. {title}\n {excerpt}\n Source: {url}\n"
    research_context += "\n**Use this research to:**\n"
    research_context += "- Identify current trends and popular angles\n"
    research_context += "- Enhance SEO keywords with real search data\n"
    research_context += "- Ensure content is relevant and up-to-date\n"
    research_context += "- Reference credible sources in the plan\n"
    research_context += "- Identify gaps or unique angles not covered by competitors\n"
    return research_context

def _format_research_sources(self, sources: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Trim raw provider sources to the fields returned with the plan."""
    return [
        {
            "title": source.get("title", "") or "",
            "url": source.get("url", "") or "",
            # Same excerpt -> summary fallback the prompt context uses, so a
            # source with only a summary no longer comes back with "".
            "excerpt": self._source_excerpt(source, 300),
            "published_at": source.get("published_at"),
            "credibility_score": source.get("credibility_score", 0.85) or 0.85,
        }
        for source in sources
    ]

View File

@@ -0,0 +1,573 @@
"""
YouTube Video Renderer Service
Handles video rendering using WAN 2.5 text-to-video and audio generation.
"""
from typing import Dict, Any, List, Optional
from pathlib import Path
import base64
import uuid
import requests
from loguru import logger
from fastapi import HTTPException
from services.wavespeed.client import WaveSpeedClient
from services.llm_providers.main_audio_generation import generate_audio
from services.story_writer.video_generation_service import StoryVideoGenerationService
from services.subscription import PricingService
from services.subscription.preflight_validator import validate_scene_animation_operation
from services.llm_providers.main_video_generation import track_video_usage
from utils.logger_utils import get_service_logger
from utils.asset_tracker import save_asset_to_library
logger = get_service_logger("youtube.renderer")
class YouTubeVideoRendererService:
"""Service for rendering YouTube videos from scenes."""
def __init__(self):
    """Create the WaveSpeed client and make sure the video output folder exists."""
    self.wavespeed_client = WaveSpeedClient()

    # Rendered videos are written to "youtube_videos" under the directory
    # four levels above this module file.
    module_file = Path(__file__)
    root_dir = module_file.parent.parent.parent.parent
    self.output_dir = root_dir / "youtube_videos"
    self.output_dir.mkdir(parents=True, exist_ok=True)

    logger.info(f"[YouTubeRenderer] Initialized with output directory: {self.output_dir}")
def render_scene_video(
    self,
    scene: Dict[str, Any],
    video_plan: Dict[str, Any],
    user_id: str,
    resolution: str = "720p",
    generate_audio_enabled: bool = True,
    voice_id: str = "Wise_Woman",
) -> Dict[str, Any]:
    """
    Render a single scene into a video.

    Narration audio is taken from the scene's existing ``audioUrl`` when it
    can be loaded, otherwise generated (if enabled). The clip is produced by
    the WaveSpeed text-to-video call, saved, and its usage tracked.

    Args:
        scene: Scene data with narration and visual prompts
        video_plan: Original video plan for context (not read by this method)
        user_id: Clerk user ID
        resolution: Video resolution (480p, 720p, 1080p)
        generate_audio_enabled: Whether to generate narration audio when no
            existing audio could be loaded
        voice_id: Voice ID for audio generation

    Returns:
        Dictionary with the saved video's filename, URL and path, plus
        duration, cost, resolution, width/height, file size, prediction id
        and usage info

    Raises:
        HTTPException: 400 for a missing or too-short visual prompt, 504/502
            for WaveSpeed timeouts/request failures, 500 for anything else.
            Every failure is re-wrapped into a "Failed to render scene N"
            detail dict that keeps the original status code.
    """
    # Bound unconditionally: the WaveSpeed handlers below reference
    # requests.exceptions.*, and a function-level import makes `requests` a
    # local name that must be assigned on every path before they are evaluated.
    import requests

    try:
        scene_number = scene.get("scene_number", 1)
        narration = (scene.get("narration") or "").strip()
        visual_prompt = (
            scene.get("enhanced_visual_prompt") or scene.get("visual_prompt") or ""
        ).strip()
        duration_estimate = scene.get("duration_estimate", 5)

        # VALIDATION: reject unusable prompts before ANY paid work (audio
        # generation included), not just before the video call.
        if not visual_prompt:
            raise HTTPException(
                status_code=400,
                detail={
                    "error": f"Scene {scene_number} has no visual prompt",
                    "scene_number": scene_number,
                    "message": "Visual prompt is required for video generation",
                    "user_action": "Please add a visual description for this scene before rendering.",
                }
            )
        if len(visual_prompt) < 5:
            raise HTTPException(
                status_code=400,
                detail={
                    "error": f"Scene {scene_number} has invalid visual prompt",
                    "scene_number": scene_number,
                    "message": "Visual prompt must be at least 5 characters",
                    "user_action": "Please provide a valid visual description for this scene.",
                }
            )
        if len(visual_prompt) < 10:
            logger.warning(
                f"[YouTubeRenderer] Scene {scene_number} has very short visual prompt "
                f"({len(visual_prompt)} chars), may result in poor quality"
            )

        # Clamp duration to valid WAN 2.5 values (5 or 10 seconds); a missing
        # or non-numeric estimate falls back to the 5 second clip.
        try:
            duration = 5 if float(duration_estimate) <= 7 else 10
        except (TypeError, ValueError):
            duration = 5

        # Log asset usage status
        has_existing_image = bool(scene.get("imageUrl"))
        has_existing_audio = bool(scene.get("audioUrl"))
        logger.info(
            f"[YouTubeRenderer] Rendering scene {scene_number}: "
            f"resolution={resolution}, duration={duration}s, prompt_length={len(visual_prompt)}, "
            f"has_existing_image={has_existing_image}, has_existing_audio={has_existing_audio}"
        )

        # Use existing audio if available, otherwise generate if requested
        audio_base64 = None
        scene_audio_url = scene.get("audioUrl")
        if scene_audio_url:
            audio_base64 = self._load_existing_scene_audio(scene_audio_url, scene_number, user_id)

        if not audio_base64 and generate_audio_enabled and narration:
            try:
                audio_result = generate_audio(
                    text=narration,
                    voice_id=voice_id,
                    user_id=user_id,
                )
                # generate_audio may return raw bytes or AudioGenerationResult
                audio_bytes = audio_result.audio_bytes if hasattr(audio_result, "audio_bytes") else audio_result
                # Convert to base64 (just the base64 string, not data URI)
                audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
                logger.info(f"[YouTubeRenderer] Generated new audio for scene {scene_number}")
            except Exception as e:
                # Best effort: a silent clip is preferred over failing the render.
                logger.warning(f"[YouTubeRenderer] Audio generation failed: {e}, continuing without audio")

        # Generate video using WAN 2.5 text-to-video.
        # This is the expensive API call - all validation is done above.
        # Sync mode waits for the result directly (prevents timeout issues).
        try:
            video_result = self.wavespeed_client.generate_text_video(
                prompt=visual_prompt,
                resolution=resolution,
                duration=duration,
                audio_base64=audio_base64,  # Optional: enables lip-sync if provided
                enable_prompt_expansion=True,
                enable_sync_mode=True,  # Use sync mode to wait for result directly
                timeout=600,  # Increased timeout for sync mode (10 minutes)
            )
        except requests.exceptions.Timeout as e:
            logger.error(f"[YouTubeRenderer] WaveSpeed API timed out for scene {scene_number}: {e}")
            raise HTTPException(
                status_code=504,
                detail={
                    "error": "WaveSpeed request timed out",
                    "scene_number": scene_number,
                    "message": "The video generation request timed out.",
                    "user_action": "Please retry. If it persists, try fewer scenes, lower resolution, or shorter durations.",
                },
            ) from e
        except requests.exceptions.RequestException as e:
            logger.error(f"[YouTubeRenderer] WaveSpeed API request failed for scene {scene_number}: {e}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "WaveSpeed request failed",
                    "scene_number": scene_number,
                    "message": str(e),
                    "user_action": "Please retry. If it persists, check network connectivity or try again later.",
                },
            ) from e

        # Save scene video
        video_service = StoryVideoGenerationService(output_dir=str(self.output_dir))
        save_result = video_service.save_scene_video(
            video_bytes=video_result["video_bytes"],
            scene_number=scene_number,
            user_id=user_id,
        )

        # Update video URL to use YouTube API endpoint
        filename = save_result["video_filename"]
        save_result["video_url"] = f"/api/youtube/videos/{filename}"

        # Track usage
        usage_info = track_video_usage(
            user_id=user_id,
            provider=video_result["provider"],
            model_name=video_result["model_name"],
            prompt=visual_prompt,
            video_bytes=video_result["video_bytes"],
            cost_override=video_result["cost"],
        )

        logger.info(
            f"[YouTubeRenderer] ✅ Scene {scene_number} rendered: "
            f"cost=${video_result['cost']:.2f}, size={len(video_result['video_bytes'])} bytes"
        )

        return {
            "scene_number": scene_number,
            "video_filename": save_result["video_filename"],
            "video_url": save_result["video_url"],
            "video_path": save_result["video_path"],
            "duration": video_result["duration"],
            "cost": video_result["cost"],
            "resolution": resolution,
            "width": video_result["width"],
            "height": video_result["height"],
            "file_size": save_result["file_size"],
            "prediction_id": video_result.get("prediction_id"),
            "usage_info": usage_info,
        }

    except HTTPException as e:
        # Re-raise with better error message for UI
        error_detail = e.detail
        if isinstance(error_detail, dict):
            error_msg = error_detail.get("error", str(error_detail))
        else:
            error_msg = str(error_detail)

        logger.error(
            f"[YouTubeRenderer] Scene {scene_number} failed: {error_msg}",
            exc_info=True
        )
        raise HTTPException(
            status_code=e.status_code,
            detail={
                "error": f"Failed to render scene {scene_number}",
                "scene_number": scene_number,
                "message": error_msg,
                "user_action": "Please try again. If the issue persists, check your scene content and try a different resolution.",
            }
        ) from e
    except Exception as e:
        logger.error(f"[YouTubeRenderer] Error rendering scene {scene_number}: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail={
                "error": f"Failed to render scene {scene_number}",
                "scene_number": scene_number,
                "message": str(e),
                "user_action": "Please try again. If the issue persists, check your scene content and try a different resolution.",
            }
        ) from e


def _load_existing_scene_audio(self, scene_audio_url: str, scene_number: Any, user_id: str):
    """Load a scene's already-generated narration as a base64 string, or return None if it cannot be found."""
    from pathlib import Path
    from urllib.parse import urlparse

    try:
        logger.info(f"[YouTubeRenderer] Attempting to load existing audio for scene {scene_number} from URL: {scene_audio_url}")

        # Extract filename from URL (e.g., /api/youtube/audio/filename.mp3)
        audio_filename = Path(urlparse(scene_audio_url).path).name

        # Try to load from local file system first
        base_dir = Path(__file__).parent.parent.parent.parent
        youtube_audio_dir = base_dir / "youtube_audio"
        audio_path = youtube_audio_dir / audio_filename

        if not audio_path.exists():
            logger.debug(f"[YouTubeRenderer] Audio file not found at {audio_path}. Searching for alternative matches...")
            alternative_path = self._find_alternative_audio_file(youtube_audio_dir, audio_filename, scene_number)
            if alternative_path is not None:
                audio_path = alternative_path
                audio_filename = alternative_path.name

        if audio_path.exists() and audio_path.is_file():
            audio_bytes = audio_path.read_bytes()
            logger.info(f"[YouTubeRenderer] ✅ Using existing audio for scene {scene_number} from local file: {audio_filename} ({len(audio_bytes)} bytes)")
        else:
            # File not found locally - try loading from asset library
            logger.warning(
                f"[YouTubeRenderer] Audio file not found locally at {audio_path}. "
                f"Attempting to load from asset library (filename: {audio_filename})"
            )
            audio_bytes = self._load_audio_from_asset_library(audio_filename, audio_path, user_id, scene_number)

        return base64.b64encode(audio_bytes).decode('utf-8')

    except FileNotFoundError as e:
        logger.warning(f"[YouTubeRenderer] ❌ Audio file not found: {e}. Will generate new audio if enabled.")
    except Exception as e:
        logger.warning(f"[YouTubeRenderer] ❌ Failed to load existing audio: {e}. Will generate new audio if enabled.", exc_info=True)
    # Any failure falls back to generation in the caller.
    return None


def _find_alternative_audio_file(self, youtube_audio_dir, audio_filename: str, scene_number: Any):
    """Search the audio directory for a differently named MP3 of the same scene; return its Path or None."""
    if not youtube_audio_dir.exists():
        return None

    all_files = list(youtube_audio_dir.glob("*.mp3"))
    logger.debug(f"[YouTubeRenderer] Found {len(all_files)} MP3 files in directory")

    # The filename format is: scene_{scene_number}_{clean_title}_{unique_id}.mp3
    expected_parts = audio_filename.replace('.mp3', '').split('_')
    matching_files = []
    if len(expected_parts) >= 3:
        scene_num_str = expected_parts[1] if expected_parts[0] == 'scene' else None
        # A non-numeric scene token must not abort the whole lookup, so it is
        # parsed defensively and title matching is used instead.
        expected_scene_num = (
            int(scene_num_str) if scene_num_str and scene_num_str.isdecimal() else None
        )
        title_part = expected_parts[2]

        for candidate in all_files:
            file_parts = candidate.stem.split('_')
            if len(file_parts) < 3 or file_parts[0] != 'scene':
                continue
            if expected_scene_num is not None:
                # Match by scene number (try both 0-indexed and 1-indexed)
                # NOTE(review): this can pick audio written for a different
                # video whose scene number is within +/-1 - confirm acceptable.
                file_scene_num = int(file_parts[1]) if file_parts[1].isdecimal() else None
                if file_scene_num is not None and abs(file_scene_num - expected_scene_num) <= 1:
                    matching_files.append(candidate.name)
            # Or match by title
            elif title_part and title_part.lower() in file_parts[2].lower():
                matching_files.append(candidate.name)

    if not matching_files:
        # Show sample files for debugging
        sample_files = [f.name for f in all_files[:10] if f.name.startswith("scene_")]
        if sample_files:
            logger.debug(f"[YouTubeRenderer] Sample scene audio files in directory: {sample_files}")
        return None

    logger.info(
        f"[YouTubeRenderer] Found potential audio file matches for scene {scene_number}: {matching_files[:3]}. "
        f"Expected: {audio_filename}"
    )
    # Try using the first match
    alternative_path = youtube_audio_dir / matching_files[0]
    if alternative_path.exists() and alternative_path.is_file():
        logger.info(f"[YouTubeRenderer] Using alternative audio file: {matching_files[0]}")
        return alternative_path
    logger.warning(f"[YouTubeRenderer] Alternative match found but file doesn't exist: {alternative_path}")
    return None


def _load_audio_from_asset_library(self, audio_filename: str, audio_path, user_id: str, scene_number: Any) -> bytes:
    """Read the audio bytes of the user's asset-library entry named `audio_filename`; raise FileNotFoundError otherwise."""
    from pathlib import Path

    try:
        from services.content_asset_service import ContentAssetService
        from services.database import get_db
        from models.content_asset_models import AssetType, AssetSource

        db = next(get_db())
        try:
            asset_service = ContentAssetService(db)
            # Try to find the asset by filename and source
            assets = asset_service.get_assets(
                user_id=user_id,
                asset_type=AssetType.AUDIO,
                source_module=AssetSource.YOUTUBE_CREATOR,
                limit=100,
            )
            matching_asset = next(
                (asset for asset in assets if asset.filename == audio_filename), None
            )
            if not (matching_asset and matching_asset.file_path):
                raise FileNotFoundError(f"Audio asset not found in library for filename: {audio_filename}")

            asset_path = Path(matching_asset.file_path)
            if not (asset_path.exists() and asset_path.is_file()):
                raise FileNotFoundError(f"Asset library file path does not exist: {asset_path}")

            audio_bytes = asset_path.read_bytes()
            logger.info(
                f"[YouTubeRenderer] ✅ Loaded audio for scene {scene_number} from asset library: "
                f"{audio_filename} ({len(audio_bytes)} bytes)"
            )
            return audio_bytes
        finally:
            db.close()
    except Exception as asset_error:
        # Every library failure is reported as "not found" so the caller
        # falls back to generating fresh audio.
        logger.warning(
            f"[YouTubeRenderer] Failed to load audio from asset library: {asset_error}. "
            f"Original path attempted: {audio_path}"
        )
        raise FileNotFoundError(
            f"Audio file not found at {audio_path} and not found in asset library: {asset_error}"
        ) from asset_error
def render_full_video(
    self,
    scenes: List[Dict[str, Any]],
    video_plan: Dict[str, Any],
    user_id: str,
    resolution: str = "720p",
    combine_scenes: bool = True,
    voice_id: str = "Wise_Woman",
) -> Dict[str, Any]:
    """
    Render a complete video from multiple scenes.

    Each enabled scene is rendered in order through ``render_scene_video``;
    the first scene that fails aborts the whole render.

    Args:
        scenes: List of scene data (scenes with ``enabled`` set falsy are skipped)
        video_plan: Original video plan
        user_id: Clerk user ID
        resolution: Video resolution
        combine_scenes: Whether to combine scenes into single video
        voice_id: Voice ID for narration

    Returns:
        Dictionary with video metadata and scene results. ``final_video_path``
        and ``final_video_url`` stay ``None`` unless ``combine_scenes`` is set
        and more than one scene was rendered.

    Raises:
        HTTPException: 400 when no scene is enabled, whatever
            ``render_scene_video`` raised for a failing scene, or 500 for any
            other error.
    """
    try:
        logger.info(
            f"[YouTubeRenderer] Rendering full video: {len(scenes)} scenes, "
            f"resolution={resolution}, user={user_id}"
        )

        # Filter enabled scenes
        enabled_scenes = [s for s in scenes if s.get("enabled", True)]
        if not enabled_scenes:
            raise HTTPException(status_code=400, detail="No enabled scenes to render")

        scene_results = []
        total_cost = 0.0

        # Render each scene
        for idx, scene in enumerate(enabled_scenes):
            logger.info(
                f"[YouTubeRenderer] Rendering scene {idx + 1}/{len(enabled_scenes)}: "
                f"Scene {scene.get('scene_number', idx + 1)}"
            )
            scene_result = self.render_scene_video(
                scene=scene,
                video_plan=video_plan,
                user_id=user_id,
                resolution=resolution,
                generate_audio_enabled=True,
                voice_id=voice_id,
            )
            scene_results.append(scene_result)
            total_cost += scene_result["cost"]

        # Combine scenes if requested
        final_video_path = None
        final_video_url = None
        if combine_scenes and len(scene_results) > 1:
            logger.info("[YouTubeRenderer] Combining scenes into final video...")

            # Prepare data for video concatenation
            scene_video_paths = [r["video_path"] for r in scene_results]
            scene_audio_paths = [r.get("audio_path") for r in scene_results if r.get("audio_path")]

            # A plan may carry "video_summary": None (or a non-string); fall
            # back to the default title instead of failing on the slice after
            # every scene has already been rendered and billed.
            story_title = str(video_plan.get("video_summary") or "YouTube Video")[:50]

            # Use StoryVideoGenerationService to combine
            video_service = StoryVideoGenerationService(output_dir=str(self.output_dir))

            # Create scene dicts for concatenation
            scene_dicts = [
                {
                    "scene_number": r["scene_number"],
                    "title": f"Scene {r['scene_number']}",
                }
                for r in scene_results
            ]

            combined_result = video_service.generate_story_video(
                scenes=scene_dicts,
                image_paths=[None] * len(scene_results),  # No static images
                audio_paths=scene_audio_paths if scene_audio_paths else [],
                video_paths=scene_video_paths,  # Use rendered videos
                user_id=user_id,
                story_title=story_title,
                fps=24,
            )
            final_video_path = combined_result["video_path"]
            final_video_url = combined_result["video_url"]

        logger.info(
            f"[YouTubeRenderer] ✅ Full video rendered: {len(scene_results)} scenes, "
            f"total_cost=${total_cost:.2f}"
        )

        return {
            "success": True,
            "scene_results": scene_results,
            "total_cost": total_cost,
            "final_video_path": final_video_path,
            "final_video_url": final_video_url,
            "num_scenes": len(scene_results),
            "resolution": resolution,
        }

    except HTTPException:
        # Already shaped for the API layer; pass through untouched.
        raise
    except Exception as e:
        logger.error(f"[YouTubeRenderer] Error rendering full video: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to render video: {str(e)}"
        ) from e
def estimate_render_cost(
    self,
    scenes: List[Dict[str, Any]],
    resolution: str = "720p",
    image_model: str = "ideogram-v3-turbo",
) -> Dict[str, Any]:
    """
    Estimate the cost of rendering a video before actually rendering it.

    Only scenes whose ``enabled`` flag is truthy (or absent) are counted.
    Each counted scene contributes one image plus a 5 or 10 second clip.

    Args:
        scenes: List of scene data with duration estimates
        resolution: Video resolution (480p, 720p, 1080p); unknown values are
            priced like 720p
        image_model: Image model used for per-scene image pricing; unknown
            models are priced at 0.10 per scene

    Returns:
        Dictionary with cost breakdown and total estimate
    """
    # Pricing per second (same as in WaveSpeedClient)
    pricing = {
        "480p": 0.05,
        "720p": 0.10,
        "1080p": 0.15,
    }
    price_per_second = pricing.get(resolution, 0.10)

    # Image generation pricing
    image_pricing = {
        "ideogram-v3-turbo": 0.10,
        "qwen-image": 0.05,
    }
    image_cost_per_scene = image_pricing.get(image_model, 0.10)

    # Filter enabled scenes
    enabled_scenes = [s for s in (scenes or []) if s.get("enabled", True)]

    scene_costs = []
    total_cost = 0.0
    total_duration = 0.0
    total_image_cost = len(enabled_scenes) * image_cost_per_scene

    for scene in enabled_scenes:
        scene_number = scene.get("scene_number", 0)
        duration_estimate = scene.get("duration_estimate", 5)

        # Clamp duration to valid WAN 2.5 values (5 or 10 seconds). A null or
        # non-numeric estimate is priced as the 5 second clip instead of
        # crashing the comparison.
        try:
            duration = 5 if float(duration_estimate) <= 7 else 10
        except (TypeError, ValueError):
            duration = 5

        scene_cost = price_per_second * duration
        scene_costs.append({
            "scene_number": scene_number,
            "duration_estimate": duration_estimate,
            "actual_duration": duration,
            "cost": round(scene_cost, 2),
        })
        total_cost += scene_cost
        total_duration += duration

    # Add image costs to total
    total_cost += total_image_cost

    return {
        "resolution": resolution,
        "price_per_second": price_per_second,
        "num_scenes": len(enabled_scenes),
        "total_duration_seconds": total_duration,
        "scene_costs": scene_costs,
        "total_cost": round(total_cost, 2),
        "estimated_cost_range": {
            "min": round(total_cost * 0.9, 2),  # 10% buffer
            "max": round(total_cost * 1.1, 2),  # 10% buffer
        },
        "image_model": image_model,
        "image_cost_per_scene": image_cost_per_scene,
        "total_image_cost": round(total_image_cost, 2),
    }

View File

@@ -0,0 +1,598 @@
"""
YouTube Scene Builder Service
Converts video plans into structured scenes with narration, visual prompts, and timing.
"""
from typing import Dict, Any, Optional, List
from loguru import logger
from fastapi import HTTPException
from services.llm_providers.main_text_generation import llm_text_gen
from services.story_writer.prompt_enhancer_service import PromptEnhancerService
from utils.logger_utils import get_service_logger
# Rebinds the module-level name `logger`: the loguru logger imported above is
# replaced by the service logger returned for "youtube.scene_builder".
logger = get_service_logger("youtube.scene_builder")
class YouTubeSceneBuilderService:
"""Service for building structured video scenes from plans."""
def __init__(self):
    """Create the prompt enhancer this service holds and log that it is ready."""
    enhancer = PromptEnhancerService()
    self.prompt_enhancer = enhancer
    logger.info("[YouTubeSceneBuilder] Service initialized")
def build_scenes_from_plan(
    self,
    video_plan: Dict[str, Any],
    user_id: str,
    custom_script: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Build structured scenes from a video plan.

    Scene source, in priority order:
    1. Scenes already present in the plan when it is flagged
       ``_scenes_included`` - reused as-is
    2. ``custom_script`` - parsed into scenes
    3. Otherwise - scenes are generated from the plan

    The result is truncated to ``duration_metadata["max_scenes"]`` (default
    10) and then passed through visual prompt enhancement.

    Args:
        video_plan: Video plan from planner service
        user_id: Clerk user ID for subscription checking
        custom_script: Optional custom script to use instead of generating

    Returns:
        List of scene dictionaries with narration, visual prompts, timing, etc.

    Raises:
        HTTPException: Re-raised unchanged if raised while building; any other
            error is converted to a 500.
    """
    try:
        duration_type = video_plan.get('duration_type', 'medium')
        logger.info(
            f"[YouTubeSceneBuilder] Building scenes from plan: "
            f"duration={duration_type}, "
            f"sections={len(video_plan.get('content_outline') or [])}, "
            f"user={user_id}"
        )

        # `or {}` also covers a plan that carries an explicit null here.
        duration_metadata = video_plan.get("duration_metadata") or {}
        max_scenes = duration_metadata.get("max_scenes", 10)

        # Optimization: Check if scenes already exist in plan (prevents duplicate generation)
        # This can happen if plan was generated with include_scenes=True for shorts
        existing_scenes = video_plan.get("scenes", [])
        scenes_included = video_plan.get("_scenes_included")

        if existing_scenes and scenes_included:
            # Scenes already generated in plan - reuse them (0 AI calls)
            logger.info(
                f"[YouTubeSceneBuilder] ♻️ Reusing {len(existing_scenes)} scenes from plan "
                f"(duration={duration_type}) - skipping generation to save AI calls"
            )
            scenes = self._normalize_scenes_from_plan(video_plan, duration_metadata)
        elif custom_script:
            # Custom script provided - parse it into scenes (0 AI calls for parsing)
            logger.info(
                f"[YouTubeSceneBuilder] Parsing custom script for scene generation "
                f"(0 AI calls required)"
            )
            scenes = self._parse_custom_script(
                custom_script, video_plan, duration_metadata, user_id
            )
        else:
            # Reaching here with the flag set means the plan has no scenes
            # (the first branch would have taken them), so the only thing
            # left to do for a flagged shorts plan is warn and regenerate.
            if scenes_included and duration_type == "shorts":
                logger.warning(
                    "[YouTubeSceneBuilder] Plan marked _scenes_included but no scenes present; "
                    "regenerating scenes normally."
                )
            # Generate scenes from plan
            scenes = self._generate_scenes_from_plan(
                video_plan, duration_metadata, user_id
            )

        # Limit to max scenes
        if len(scenes) > max_scenes:
            logger.warning(
                f"[YouTubeSceneBuilder] Truncating {len(scenes)} scenes to {max_scenes}"
            )
            scenes = scenes[:max_scenes]

        # Enhance visual prompts efficiently based on duration type
        scenes = self._enhance_visual_prompts_batch(
            scenes, video_plan, user_id, duration_type
        )

        logger.info(f"[YouTubeSceneBuilder] ✅ Built {len(scenes)} scenes")
        return scenes

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"[YouTubeSceneBuilder] Error building scenes: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to build scenes: {str(e)}"
        ) from e
def _generate_scenes_from_plan(
    self,
    video_plan: Dict[str, Any],
    duration_metadata: Dict[str, Any],
    user_id: str,
) -> List[Dict[str, Any]]:
    """
    Generate scenes from video plan using AI.

    Builds one scriptwriting prompt from the plan (summary, goal, hook,
    outline, CTA, timing), requests a JSON array of scenes from
    ``llm_text_gen`` and normalizes every returned scene to a fixed set of
    keys.

    Args:
        video_plan: Video plan supplying the prompt fields
        duration_metadata: Timing limits (``scene_duration_range``,
            ``target_seconds``, ``hook_seconds``)
        user_id: User the generation call is made for

    Returns:
        List of normalized scene dictionaries; ``visual_prompt`` starts out
        equal to ``visual_description``.

    Raises:
        ValueError: If the model response cannot be interpreted as a list of
            scenes (includes JSON decoding errors for string responses).
    """
    import json

    content_outline = video_plan.get("content_outline") or []
    hook_strategy = video_plan.get("hook_strategy", "")
    call_to_action = video_plan.get("call_to_action", "")
    visual_style = video_plan.get("visual_style", "cinematic")
    tone = video_plan.get("tone", "professional")
    scene_duration_range = duration_metadata.get("scene_duration_range", (5, 15))

    # One "section: description (Ns)" line per outline entry.
    outline_text = "\n".join(
        f"{section.get('section', '')}: {section.get('description', '')} ({section.get('duration_estimate', 0)}s)"
        for section in content_outline
    )

    scene_generation_prompt = f"""You are a top YouTube scriptwriter specializing in engaging, viral content. Create compelling scenes that captivate viewers and maximize watch time.
**VIDEO PLAN:**
📝 Summary: {video_plan.get('video_summary', '')}
🎯 Goal: {video_plan.get('video_goal', '')}
💡 Key Message: {video_plan.get('key_message', '')}
🎨 Visual Style: {visual_style}
🎭 Tone: {tone}
**🎣 HOOK STRATEGY:**
{hook_strategy}
**📋 CONTENT STRUCTURE:**
{outline_text}
**🚀 CALL-TO-ACTION:**
{call_to_action}
**⏱️ TIMING CONSTRAINTS:**
• Scene duration: {scene_duration_range[0]}-{scene_duration_range[1]} seconds each
• Total target: {duration_metadata.get('target_seconds', 150)} seconds
**🎬 YOUR MISSION - CREATE VIRAL-WORTHY SCENES:**
Write narration that:
✨ **HOOKS IMMEDIATELY** - First {duration_metadata.get('hook_seconds', 10)}s must GRAB attention
🎭 **TELLS A STORY** - Each scene advances the narrative with emotional engagement
💡 **DELIVERS VALUE** - Provide insights, tips, or "aha!" moments in every scene
🔥 **BUILDS EXCITEMENT** - Use power words, questions, and cliffhangers
👥 **CONNECTS PERSONALLY** - Speak directly to the viewer's needs and desires
⚡ **MAINTAINS PACE** - Vary sentence length for natural rhythm
🎯 **DRIVES ACTION** - Build toward the CTA with increasing urgency
**REQUIRED SCENE ELEMENTS:**
1. **scene_number**: Sequential numbering
2. **title**: Catchy, descriptive title (5-8 words max)
3. **narration**: ENGAGING spoken script with:
- Conversational language ("you know what I mean?")
- Rhetorical questions ("Have you ever wondered...?")
- Power transitions ("But here's the game-changer...")
- Emotional hooks ("Imagine this...")
- Action-oriented language ("Let's dive in...")
4. **visual_description**: Cinematic, professional YouTube visuals
5. **duration_estimate**: Realistic speaking time
6. **emphasis**: hook/main_content/transition/cta
7. **visual_cues**: ["dramatic_zoom", "text_overlay", "fast_cuts"]
**🎯 YOUTUBE OPTIMIZATION RULES:**
• **Hook Power**: First 3 seconds = make them stay or lose them
• **Value Density**: Every 10 seconds must deliver new insight
• **Emotional Arc**: Build curiosity → teach → inspire → convert
• **Natural Flow**: Scenes must connect seamlessly
• **CTA Momentum**: Final scene creates irresistible urge to act
**📊 FORMAT AS JSON ARRAY:**
[
{{
"scene_number": 1,
"title": "The Shocking Truth They Hide",
"narration": "You won't believe what just happened in my latest discovery! I was scrolling through the usual content when BAM - this completely changed everything I thought about [topic]. And get this - it could transform YOUR results too!",
"visual_description": "Dynamic opening shot with shocking text overlay, fast cuts of social media feeds, energetic music swell, close-up of surprised reaction",
"duration_estimate": 8,
"emphasis": "hook",
"visual_cues": ["shocking_text", "fast_cuts", "music_swell", "reaction_shot"]
}},
...
]
**🔥 SUCCESS CRITERIA:**
✅ First scene hooks in 3 seconds
✅ Each scene delivers 1-2 key insights
✅ Narration feels like talking to a friend
✅ Total story arc creates emotional journey
✅ CTA feels like the natural next step
✅ Scenes fit duration perfectly"""

    system_prompt = (
        "You are a master YouTube scriptwriter who creates viral, engaging content that "
        "keeps viewers watching until the end. You understand YouTube algorithm optimization, "
        "emotional storytelling, and creating irresistible hooks that make viewers hit 'like' and 'subscribe'. "
        "Your scripts are conversational, valuable, and conversion-focused."
    )

    response = llm_text_gen(
        prompt=scene_generation_prompt,
        system_prompt=system_prompt,
        user_id=user_id,
        json_struct={
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "scene_number": {"type": "number"},
                    "title": {"type": "string"},
                    "narration": {"type": "string"},
                    "visual_description": {"type": "string"},
                    "duration_estimate": {"type": "number"},
                    "emphasis": {"type": "string"},
                    "visual_cues": {
                        "type": "array",
                        "items": {"type": "string"}
                    }
                },
                "required": [
                    "scene_number", "title", "narration", "visual_description",
                    "duration_estimate", "emphasis"
                ]
            }
        }
    )

    # Parse response. Decode text first so that a JSON *string* holding
    # {"scenes": [...]} is unwrapped exactly like an already-decoded dict.
    parsed = json.loads(response) if isinstance(response, str) else response
    if isinstance(parsed, dict) and "scenes" in parsed:
        parsed = parsed["scenes"]
    if not isinstance(parsed, list):
        # Iterating e.g. a dict here would walk its keys and fail later with
        # an opaque AttributeError; fail with a readable message instead.
        raise ValueError(
            f"Scene generation returned {type(parsed).__name__} instead of a list of scenes"
        )

    # Normalize scene data
    normalized_scenes = []
    for idx, scene in enumerate(parsed, 1):
        if not isinstance(scene, dict):
            logger.warning(
                f"[YouTubeSceneBuilder] Skipping malformed scene entry at position {idx}: "
                f"expected object, got {type(scene).__name__}"
            )
            continue
        normalized_scenes.append({
            "scene_number": scene.get("scene_number", idx),
            "title": scene.get("title", f"Scene {idx}"),
            "narration": scene.get("narration", ""),
            "visual_description": scene.get("visual_description", ""),
            "duration_estimate": scene.get("duration_estimate", scene_duration_range[0]),
            "emphasis": scene.get("emphasis", "main_content"),
            "visual_cues": scene.get("visual_cues", []),
            "visual_prompt": scene.get("visual_description", ""),  # Initial prompt
        })
    return normalized_scenes
def _normalize_scenes_from_plan(
    self,
    video_plan: Dict[str, Any],
    duration_metadata: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """
    Bring the scenes embedded in a plan into the builder's scene shape.

    Missing fields get defaults: position-based number and title, empty
    narration/description, the lower bound of ``scene_duration_range`` as
    duration, ``main_content`` emphasis and no visual cues. The initial
    ``visual_prompt`` is the scene's visual description.
    """
    duration_bounds = duration_metadata.get("scene_duration_range", (2, 8))
    planned_scenes = video_plan.get("scenes", [])

    def _with_defaults(position: int, raw: Dict[str, Any]) -> Dict[str, Any]:
        description = raw.get("visual_description", "")
        return {
            "scene_number": raw.get("scene_number", position),
            "title": raw.get("title", f"Scene {position}"),
            "narration": raw.get("narration", ""),
            "visual_description": description,
            # Lower bound is looked up per scene, so an empty plan never indexes it.
            "duration_estimate": raw.get("duration_estimate", duration_bounds[0]),
            "emphasis": raw.get("emphasis", "main_content"),
            "visual_cues": raw.get("visual_cues", []),
            "visual_prompt": description,  # Initial prompt
        }

    normalized_scenes = [
        _with_defaults(position, raw)
        for position, raw in enumerate(planned_scenes, 1)
    ]

    logger.info(
        f"[YouTubeSceneBuilder] ✅ Normalized {len(normalized_scenes)} scenes "
        f"from optimized plan (saved 1 AI call)"
    )
    return normalized_scenes
def _parse_custom_script(
self,
custom_script: str,
video_plan: Dict[str, Any],
duration_metadata: Dict[str, Any],
user_id: str,
) -> List[Dict[str, Any]]:
"""Parse a custom script into structured scenes."""
# Simple parsing: split by double newlines or scene markers
import re
# Try to detect scene markers
scene_pattern = r'(?:Scene\s+\d+|#\s*\d+\.|^\d+\.)\s*(.+?)(?=(?:Scene\s+\d+|#\s*\d+\.|^\d+\.|$))'
matches = re.finditer(scene_pattern, custom_script, re.MULTILINE | re.DOTALL)
scenes = []
for idx, match in enumerate(matches, 1):
scene_text = match.group(1).strip()
# Extract narration (first paragraph or before visual markers)
narration_match = re.search(r'^(.*?)(?:\n\n|Visual:|Image:)', scene_text, re.DOTALL)
narration = narration_match.group(1).strip() if narration_match else scene_text.split('\n')[0]
# Extract visual description
visual_match = re.search(r'(?:Visual:|Image:)\s*(.+?)(?:\n\n|$)', scene_text, re.DOTALL)
visual_description = visual_match.group(1).strip() if visual_match else narration
scenes.append({
"scene_number": idx,
"title": f"Scene {idx}",
"narration": narration,
"visual_description": visual_description,
"duration_estimate": duration_metadata.get("scene_duration_range", [5, 15])[0],
"emphasis": "hook" if idx == 1 else ("cta" if idx == len(list(matches)) else "main_content"),
"visual_cues": [],
"visual_prompt": visual_description,
})
# Fallback: split by paragraphs if no scene markers
if not scenes:
paragraphs = [p.strip() for p in custom_script.split('\n\n') if p.strip()]
for idx, para in enumerate(paragraphs[:duration_metadata.get("max_scenes", 10)], 1):
scenes.append({
"scene_number": idx,
"title": f"Scene {idx}",
"narration": para,
"visual_description": para,
"duration_estimate": duration_metadata.get("scene_duration_range", [5, 15])[0],
"emphasis": "hook" if idx == 1 else ("cta" if idx == len(paragraphs) else "main_content"),
"visual_cues": [],
"visual_prompt": para,
})
return scenes
def _enhance_visual_prompts_batch(
self,
scenes: List[Dict[str, Any]],
video_plan: Dict[str, Any],
user_id: str,
duration_type: str,
) -> List[Dict[str, Any]]:
"""
Efficiently enhance visual prompts based on video duration type.
Strategy:
- Shorts: Skip enhancement (use original descriptions) - 0 AI calls
- Medium: Batch enhance all scenes in 1 call - 1 AI call
- Long: Batch enhance in 2 calls (split scenes) - 2 AI calls max
"""
# For shorts, skip enhancement to save API calls
if duration_type == "shorts":
logger.info(
f"[YouTubeSceneBuilder] Skipping prompt enhancement for shorts "
f"({len(scenes)} scenes) to save API calls"
)
for scene in scenes:
scene["enhanced_visual_prompt"] = scene.get(
"visual_prompt", scene.get("visual_description", "")
)
return scenes
# Build story context for prompt enhancer
story_context = {
"story_setting": video_plan.get("visual_style", "cinematic"),
"story_tone": video_plan.get("tone", "professional"),
"writing_style": video_plan.get("visual_style", "cinematic"),
}
# Convert scenes to format expected by enhancer
scene_data_list = [
{
"scene_number": scene.get("scene_number", idx + 1),
"title": scene.get("title", ""),
"description": scene.get("visual_description", ""),
"image_prompt": scene.get("visual_prompt", ""),
}
for idx, scene in enumerate(scenes)
]
# For medium videos, enhance all scenes in one batch call
if duration_type == "medium":
logger.info(
f"[YouTubeSceneBuilder] Batch enhancing {len(scenes)} scenes "
f"for medium video in 1 AI call"
)
try:
# Use a single batch enhancement call
enhanced_prompts = self._batch_enhance_prompts(
scene_data_list, story_context, user_id
)
for idx, scene in enumerate(scenes):
scene["enhanced_visual_prompt"] = enhanced_prompts.get(
idx, scene.get("visual_prompt", scene.get("visual_description", ""))
)
except Exception as e:
logger.warning(
f"[YouTubeSceneBuilder] Batch enhancement failed: {e}, "
f"using original prompts"
)
for scene in scenes:
scene["enhanced_visual_prompt"] = scene.get(
"visual_prompt", scene.get("visual_description", "")
)
return scenes
# For long videos, split into 2 batches to avoid token limits
if duration_type == "long":
logger.info(
f"[YouTubeSceneBuilder] Batch enhancing {len(scenes)} scenes "
f"for long video in 2 AI calls"
)
mid_point = len(scenes) // 2
batches = [
scene_data_list[:mid_point],
scene_data_list[mid_point:],
]
all_enhanced = {}
for batch_idx, batch in enumerate(batches):
try:
enhanced = self._batch_enhance_prompts(
batch, story_context, user_id
)
start_idx = 0 if batch_idx == 0 else mid_point
for local_idx, enhanced_prompt in enhanced.items():
all_enhanced[start_idx + local_idx] = enhanced_prompt
except Exception as e:
logger.warning(
f"[YouTubeSceneBuilder] Batch {batch_idx + 1} enhancement "
f"failed: {e}, using original prompts"
)
start_idx = 0 if batch_idx == 0 else mid_point
for local_idx, scene_data in enumerate(batch):
all_enhanced[start_idx + local_idx] = scene_data.get(
"image_prompt", scene_data.get("description", "")
)
for idx, scene in enumerate(scenes):
scene["enhanced_visual_prompt"] = all_enhanced.get(
idx, scene.get("visual_prompt", scene.get("visual_description", ""))
)
return scenes
# Fallback: use original prompts
logger.warning(
f"[YouTubeSceneBuilder] Unknown duration type '{duration_type}', "
f"using original prompts"
)
for scene in scenes:
scene["enhanced_visual_prompt"] = scene.get(
"visual_prompt", scene.get("visual_description", "")
)
return scenes
def _batch_enhance_prompts(
    self,
    scene_data_list: List[Dict[str, Any]],
    story_context: Dict[str, Any],
    user_id: str,
) -> Dict[int, str]:
    """
    Enhance multiple scene prompts in a single AI call.

    Args:
        scene_data_list: Scene dictionaries for one batch. The keys read here
            are ``scene_number``, ``title``, ``description`` and
            ``image_prompt``; all are optional.
        story_context: Style context. The keys read here are
            ``story_setting``, ``story_tone`` and ``writing_style``.
        user_id: Forwarded unchanged to ``llm_text_gen``.

    Returns:
        Dictionary mapping scene index (0-based position within
        ``scene_data_list``) to a prompt. Every position in the batch has
        exactly one entry: the AI-enhanced prompt when the model returned a
        usable one, otherwise the scene's original ``image_prompt`` (or its
        ``description`` when ``image_prompt`` is absent). No key outside
        ``range(len(scene_data_list))`` is ever returned. An empty batch
        yields an empty dict without calling the model.
    """
    scene_count = len(scene_data_list)

    # Nothing to enhance: skip the AI call entirely (a caller that splits a
    # one-scene list in half hands us an empty first batch).
    if not scene_data_list:
        return {}

    def original_prompt(position: int) -> str:
        """Return the un-enhanced prompt for the scene at ``position``."""
        scene = scene_data_list[position]
        return scene.get("image_prompt", scene.get("description", ""))

    try:
        # Build batch enhancement prompt
        scenes_text = "\n\n".join(
            f"Scene {scene.get('scene_number', idx + 1)}: {scene.get('title', '')}\n"
            f"Description: {scene.get('description', '')}\n"
            f"Current Prompt: {scene.get('image_prompt', '')}"
            for idx, scene in enumerate(scene_data_list)
        )
        batch_prompt = f"""You are optimizing visual prompts for AI video generation. Enhance the following scenes to be more detailed and video-optimized.
**Video Style Context:**
- Setting: {story_context.get('story_setting', 'cinematic')}
- Tone: {story_context.get('story_tone', 'professional')}
- Style: {story_context.get('writing_style', 'cinematic')}
**Scenes to Enhance:**
{scenes_text}
**Your Task:**
For each scene, create an enhanced visual prompt (200-300 words) that:
1. Is detailed and specific for video generation
2. Includes camera movements, lighting, composition
3. Maintains consistency with the video style
4. Is optimized for WAN 2.5 text-to-video model
**Format as JSON array with enhanced prompts:**
[
{{"scene_index": 0, "enhanced_prompt": "detailed enhanced prompt for scene 1..."}},
{{"scene_index": 1, "enhanced_prompt": "detailed enhanced prompt for scene 2..."}},
...
]
Make sure the array length matches the number of scenes provided ({len(scene_data_list)}).
"""
        system_prompt = (
            "You are an expert at creating detailed visual prompts for AI video generation. "
            "Your prompts are specific, cinematic, and optimized for video models."
        )
        response = llm_text_gen(
            prompt=batch_prompt,
            system_prompt=system_prompt,
            user_id=user_id,
            json_struct={
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "scene_index": {"type": "number"},
                        "enhanced_prompt": {"type": "string"}
                    },
                    "required": ["scene_index", "enhanced_prompt"]
                }
            }
        )

        # Parse response: the provider may hand back a decoded list or raw
        # JSON text.
        if isinstance(response, str):
            import json
            response = json.loads(response)
        if not isinstance(response, list):
            # Handled by the except-branch below, which falls back to the
            # original prompts for the whole batch.
            raise ValueError(
                f"expected a JSON array of prompts, got {type(response).__name__}"
            )

        # Build result dictionary, accepting only entries that point at a
        # real scene of this batch.
        result: Dict[int, str] = {}
        for position, item in enumerate(response):
            if not isinstance(item, dict):
                # One malformed entry must not discard the good ones.
                continue

            # A missing index falls back to the entry's position in the
            # array rather than 0, so index-less entries do not all
            # overwrite the first scene.
            scene_index = item.get("scene_index", position)
            if isinstance(scene_index, bool):
                continue
            # The schema type is "number", so 2.0 is a legitimate spelling
            # of index 2.
            if isinstance(scene_index, float) and scene_index.is_integer():
                scene_index = int(scene_index)
            if not isinstance(scene_index, int):
                continue
            if not 0 <= scene_index < scene_count:
                # Negative or too-large indexes would be applied to the
                # wrong scene once a caller offsets them.
                continue

            prompt = item.get("enhanced_prompt", "")
            if isinstance(prompt, str) and prompt.strip():
                result[scene_index] = prompt

        enhanced_count = len(result)

        # Fill in any missing scenes with original prompts
        for position in range(scene_count):
            if position not in result:
                result[position] = original_prompt(position)

        logger.info(
            f"[YouTubeSceneBuilder] ✅ Batch enhanced {enhanced_count}/{scene_count} prompts "
            f"in 1 AI call"
        )
        return result
    except Exception as e:
        # Deliberate best-effort boundary: any failure (provider error,
        # invalid JSON, unexpected payload shape) degrades to the original
        # prompts instead of failing the render.
        logger.error(
            f"[YouTubeSceneBuilder] Batch enhancement failed: {e}",
            exc_info=True
        )
        # Return original prompts as fallback
        return {
            position: original_prompt(position)
            for position in range(scene_count)
        }