AI Image and Audio Generation Improvements.

AI Video Generation Pre-Flight Checklist. Cost Estimate Improvements.
2025-12-25 16:26:08 +05:30
parent 59913bffa9
commit 7512933c65
163 changed files with 8938 additions and 37401 deletions
--- a/backend/services/youtube/renderer.py
+++ b/backend/services/youtube/renderer.py
@@ -88,14 +88,49 @@ class YouTubeVideoRendererService:
            # Clamp duration to valid WAN 2.5 values (5 or 10 seconds)
            duration = 5 if duration_estimate <= 7 else 10
            
+            # Record whether the scene already has image/audio assets (included in the log line below)
+            has_existing_image = bool(scene.get("imageUrl"))
+            has_existing_audio = bool(scene.get("audioUrl"))
+            
            logger.info(
                f"[YouTubeRenderer] Rendering scene {scene_number}: "
-                f"resolution={resolution}, duration={duration}s, prompt_length={len(visual_prompt)}"
+                f"resolution={resolution}, duration={duration}s, prompt_length={len(visual_prompt)}, "
+                f"has_existing_image={has_existing_image}, has_existing_audio={has_existing_audio}"
            )
            
-            # Generate audio if requested - only if narration is not empty
+            # Use existing audio if available, otherwise generate if requested
            audio_base64 = None
-            if generate_audio_enabled and narration and len(narration.strip()) > 0:
+            scene_audio_url = scene.get("audioUrl")
+            
+            if scene_audio_url:
+                # Load existing audio from URL
+                try:
+                    from pathlib import Path
+                    from urllib.parse import urlparse
+                    
+                    # Extract filename from URL (e.g., /api/youtube/audio/filename.mp3)
+                    parsed_url = urlparse(scene_audio_url)
+                    audio_filename = Path(parsed_url.path).name
+                    
+                    # Load audio file
+                    base_dir = Path(__file__).parent.parent.parent.parent
+                    youtube_audio_dir = base_dir / "youtube_audio"
+                    audio_path = youtube_audio_dir / audio_filename
+                    
+                    if audio_path.exists():
+                        with open(audio_path, "rb") as f:
+                            audio_bytes = f.read()
+                        audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+                        logger.info(f"[YouTubeRenderer] Using existing audio for scene {scene_number} from {audio_filename}")
+                    else:
+                        logger.warning(f"[YouTubeRenderer] Audio file not found: {audio_path}, will generate new audio")
+                        raise FileNotFoundError(f"Audio file not found: {audio_path}")
+                except Exception as e:
+                    logger.warning(f"[YouTubeRenderer] Failed to load existing audio: {e}, will generate new audio")
+                    scene_audio_url = None  # Fall back to generation
+            
+            # Generate audio if not available and generation is enabled
+            if not audio_base64 and generate_audio_enabled and narration and len(narration.strip()) > 0:
                try:
                    audio_result = generate_audio(
                        text=narration,
@@ -106,7 +141,7 @@ class YouTubeVideoRendererService:
                    audio_bytes = audio_result.audio_bytes if hasattr(audio_result, "audio_bytes") else audio_result
                    # Convert to base64 (just the base64 string, not data URI)
                    audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
-                    logger.info(f"[YouTubeRenderer] Generated audio for scene {scene_number}")
+                    logger.info(f"[YouTubeRenderer] Generated new audio for scene {scene_number}")
                except Exception as e:
                    logger.warning(f"[YouTubeRenderer] Audio generation failed: {e}, continuing without audio")
            
@@ -352,6 +387,7 @@ class YouTubeVideoRendererService:
        self,
        scenes: List[Dict[str, Any]],
        resolution: str = "720p",
+        image_model: str = "ideogram-v3-turbo",
    ) -> Dict[str, Any]:
        """
        Estimate the cost of rendering a video before actually rendering it.
@@ -369,8 +405,16 @@ class YouTubeVideoRendererService:
            "720p": 0.10,
            "1080p": 0.15,
        }
-        
+
        price_per_second = pricing.get(resolution, 0.10)
+
+        # Per-scene image generation price keyed by model; unknown models fall back to 0.10
+        image_pricing = {
+            "ideogram-v3-turbo": 0.10,
+            "qwen-image": 0.05,
+        }
+
+        image_cost_per_scene = image_pricing.get(image_model, 0.10)
        
        # Filter enabled scenes
        enabled_scenes = [s for s in scenes if s.get("enabled", True)]
@@ -378,7 +422,8 @@ class YouTubeVideoRendererService:
        scene_costs = []
        total_cost = 0.0
        total_duration = 0.0
-        
+        total_image_cost = len(enabled_scenes) * image_cost_per_scene
+
        for scene in enabled_scenes:
            scene_number = scene.get("scene_number", 0)
            duration_estimate = scene.get("duration_estimate", 5)
@@ -396,7 +441,10 @@ class YouTubeVideoRendererService:
            
            total_cost += scene_cost
            total_duration += duration
-        
+
+        # Add image costs to total
+        total_cost += total_image_cost
+
        return {
            "resolution": resolution,
            "price_per_second": price_per_second,
@@ -408,5 +456,8 @@ class YouTubeVideoRendererService:
                "min": round(total_cost * 0.9, 2),  # 10% buffer
                "max": round(total_cost * 1.1, 2),  # 10% buffer
            },
+            "image_model": image_model,
+            "image_cost_per_scene": image_cost_per_scene,
+            "total_image_cost": round(total_image_cost, 2),
        }

--- a/backend/services/youtube/scene_builder.py
+++ b/backend/services/youtube/scene_builder.py
@@ -140,61 +140,87 @@ class YouTubeSceneBuilderService:
        
        scene_duration_range = duration_metadata.get("scene_duration_range", (5, 15))
        
-        scene_generation_prompt = f"""You are an expert video scriptwriter. Create detailed scenes for a YouTube video based on this plan.
+        scene_generation_prompt = f"""You are a top YouTube scriptwriter specializing in engaging, viral content. Create compelling scenes that captivate viewers and maximize watch time.

-**Video Plan:**
- Summary: {video_plan.get('video_summary', '')}
- Goal: {video_plan.get('video_goal', '')}
- Key Message: {video_plan.get('key_message', '')}
- Visual Style: {visual_style}
- Tone: {tone}
+**VIDEO PLAN:**
+📝 Summary: {video_plan.get('video_summary', '')}
+🎯 Goal: {video_plan.get('video_goal', '')}
+💡 Key Message: {video_plan.get('key_message', '')}
+🎨 Visual Style: {visual_style}
+🎭 Tone: {tone}

-**Hook Strategy:**
+**🎣 HOOK STRATEGY:**
 {hook_strategy}

-**Content Outline:**
-{chr(10).join([f"- {section.get('section', '')}: {section.get('description', '')} ({section.get('duration_estimate', 0)}s)" for section in content_outline])}
+**📋 CONTENT STRUCTURE:**
+{chr(10).join([f"• {section.get('section', '')}: {section.get('description', '')} ({section.get('duration_estimate', 0)}s)" for section in content_outline])}

-**Call-to-Action:**
+**🚀 CALL-TO-ACTION:**
 {call_to_action}

-**Duration Constraints:**
- Scene duration: {scene_duration_range[0]}-{scene_duration_range[1]} seconds each
- Total target: {duration_metadata.get('target_seconds', 150)} seconds
+**⏱️ TIMING CONSTRAINTS:**
+• Scene duration: {scene_duration_range[0]}-{scene_duration_range[1]} seconds each
+• Total target: {duration_metadata.get('target_seconds', 150)} seconds

-**Your Task:**
-Create detailed scenes that include:
-1. Scene number and title
-2. Narration text (what will be spoken)
-3. Visual description (what viewers will see)
-4. Duration estimate
-5. Emphasis tags (hook, main_content, transition, cta)
+**🎬 YOUR MISSION - CREATE VIRAL-WORTHY SCENES:**

-**Format as JSON array:**
+Write narration that:
+✨ **HOOKS IMMEDIATELY** - First {duration_metadata.get('hook_seconds', 10)}s must GRAB attention
+🎭 **TELLS A STORY** - Each scene advances the narrative with emotional engagement
+💡 **DELIVERS VALUE** - Provide insights, tips, or "aha!" moments in every scene
+🔥 **BUILDS EXCITEMENT** - Use power words, questions, and cliffhangers
+👥 **CONNECTS PERSONALLY** - Speak directly to the viewer's needs and desires
+⚡ **MAINTAINS PACE** - Vary sentence length for natural rhythm
+🎯 **DRIVES ACTION** - Build toward the CTA with increasing urgency
+
+**REQUIRED SCENE ELEMENTS:**
+1. **scene_number**: Sequential numbering
+2. **title**: Catchy, descriptive title (5-8 words max)
+3. **narration**: ENGAGING spoken script with:
+   - Conversational language ("you know what I mean?")
+   - Rhetorical questions ("Have you ever wondered...?")
+   - Power transitions ("But here's the game-changer...")
+   - Emotional hooks ("Imagine this...")
+   - Action-oriented language ("Let's dive in...")
+4. **visual_description**: Cinematic, professional YouTube visuals
+5. **duration_estimate**: Realistic speaking time
+6. **emphasis**: hook/main_content/transition/cta
+7. **visual_cues**: ["dramatic_zoom", "text_overlay", "fast_cuts"]
+
+**🎯 YOUTUBE OPTIMIZATION RULES:**
+• **Hook Power**: First 3 seconds = make them stay or lose them
+• **Value Density**: Every 10 seconds must deliver new insight
+• **Emotional Arc**: Build curiosity → teach → inspire → convert
+• **Natural Flow**: Scenes must connect seamlessly
+• **CTA Momentum**: Final scene creates irresistible urge to act
+
+**📊 FORMAT AS JSON ARRAY:**
 [
  {{
    "scene_number": 1,
-    "title": "Hook - Attention Grabber",
-    "narration": "The spoken text for this scene...",
-    "visual_description": "Detailed description of what viewers see...",
-    "duration_estimate": 5,
+    "title": "The Shocking Truth They Hide",
+    "narration": "You won't believe what just happened in my latest discovery! I was scrolling through the usual content when BAM - this completely changed everything I thought about [topic]. And get this - it could transform YOUR results too!",
+    "visual_description": "Dynamic opening shot with shocking text overlay, fast cuts of social media feeds, energetic music swell, close-up of surprised reaction",
+    "duration_estimate": 8,
    "emphasis": "hook",
-    "visual_cues": ["close-up", "dynamic", "bright"]
+    "visual_cues": ["shocking_text", "fast_cuts", "music_swell", "reaction_shot"]
  }},
  ...
 ]

-Make sure:
- First scene is a strong hook ({duration_metadata.get('hook_seconds', 10)}s)
- Last scene includes the CTA ({duration_metadata.get('cta_seconds', 10)}s)
- Each scene has clear narration and visual description
- Total duration fits within {duration_metadata.get('target_seconds', 150)} seconds
- Scenes flow naturally from one to the next
-"""
+**🔥 SUCCESS CRITERIA:**
+✅ First scene hooks in 3 seconds
+✅ Each scene delivers 1-2 key insights
+✅ Narration feels like talking to a friend
+✅ Total story arc creates emotional journey
+✅ CTA feels like the natural next step
+✅ Scenes fit duration perfectly"""
        
        system_prompt = (
-            "You are an expert video scriptwriter specializing in YouTube content. "
-            "Your scenes are engaging, well-paced, and optimized for viewer retention."
+            "You are a master YouTube scriptwriter who creates viral, engaging content that "
+            "keeps viewers watching until the end. You understand YouTube algorithm optimization, "
+            "emotional storytelling, and creating irresistible hooks that make viewers hit 'like' and 'subscribe'. "
+            "Your scripts are conversational, valuable, and conversion-focused."
        )
        
        response = llm_text_gen(