Base code

2026-01-08 22:39:53 +07:00
parent 697115c61a
commit c35fa52117
2169 changed files with 626670 additions and 0 deletions
--- a/backend/services/youtube/renderer.py
+++ b/backend/services/youtube/renderer.py
@@ -0,0 +1,573 @@
+"""
+YouTube Video Renderer Service
+
+Handles video rendering using WAN 2.5 text-to-video and audio generation.
+"""
+
+from typing import Dict, Any, List, Optional
+from pathlib import Path
+import base64
+import uuid
+import requests
+from loguru import logger
+from fastapi import HTTPException
+
+from services.wavespeed.client import WaveSpeedClient
+from services.llm_providers.main_audio_generation import generate_audio
+from services.story_writer.video_generation_service import StoryVideoGenerationService
+from services.subscription import PricingService
+from services.subscription.preflight_validator import validate_scene_animation_operation
+from services.llm_providers.main_video_generation import track_video_usage
+from utils.logger_utils import get_service_logger
+from utils.asset_tracker import save_asset_to_library
+
+# Module-level logger obtained from the project's logger utility.
+# NOTE: this rebinds the name `logger` that was imported from loguru above,
+# so every `logger.*` call in this module goes through this object instead.
+logger = get_service_logger("youtube.renderer")
+
+
+class YouTubeVideoRendererService:
+    """Service for rendering YouTube videos from scenes."""
+    
+    def __init__(self):
+        """Initialize the renderer service."""
+        self.wavespeed_client = WaveSpeedClient()
+        
+        # Video output directory
+        base_dir = Path(__file__).parent.parent.parent.parent
+        self.output_dir = base_dir / "youtube_videos"
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        
+        logger.info(f"[YouTubeRenderer] Initialized with output directory: {self.output_dir}")
+    
+    def render_scene_video(
+        self,
+        scene: Dict[str, Any],
+        video_plan: Dict[str, Any],
+        user_id: str,
+        resolution: str = "720p",
+        generate_audio_enabled: bool = True,
+        voice_id: str = "Wise_Woman",
+    ) -> Dict[str, Any]:
+        """
+        Render a single scene into a video.
+        
+        Args:
+            scene: Scene data with narration and visual prompts
+            video_plan: Original video plan for context
+            user_id: Clerk user ID
+            resolution: Video resolution (480p, 720p, 1080p)
+            generate_audio: Whether to generate narration audio
+            voice_id: Voice ID for audio generation
+            
+        Returns:
+            Dictionary with video metadata, bytes, and cost
+        """
+        try:
+            scene_number = scene.get("scene_number", 1)
+            narration = scene.get("narration", "").strip()
+            visual_prompt = (scene.get("enhanced_visual_prompt") or scene.get("visual_prompt", "")).strip()
+            duration_estimate = scene.get("duration_estimate", 5)
+            
+            # VALIDATION: Check inputs before making expensive API calls
+            if not visual_prompt:
+                raise HTTPException(
+                    status_code=400,
+                    detail={
+                        "error": f"Scene {scene_number} has no visual prompt",
+                        "scene_number": scene_number,
+                        "message": "Visual prompt is required for video generation",
+                        "user_action": "Please add a visual description for this scene before rendering.",
+                    }
+                )
+            
+            if len(visual_prompt) < 10:
+                logger.warning(
+                    f"[YouTubeRenderer] Scene {scene_number} has very short visual prompt "
+                    f"({len(visual_prompt)} chars), may result in poor quality"
+                )
+            
+            # Clamp duration to valid WAN 2.5 values (5 or 10 seconds)
+            duration = 5 if duration_estimate <= 7 else 10
+            
+            # Log asset usage status
+            has_existing_image = bool(scene.get("imageUrl"))
+            has_existing_audio = bool(scene.get("audioUrl"))
+            
+            logger.info(
+                f"[YouTubeRenderer] Rendering scene {scene_number}: "
+                f"resolution={resolution}, duration={duration}s, prompt_length={len(visual_prompt)}, "
+                f"has_existing_image={has_existing_image}, has_existing_audio={has_existing_audio}"
+            )
+            
+            # Use existing audio if available, otherwise generate if requested
+            audio_base64 = None
+            scene_audio_url = scene.get("audioUrl")
+            
+            if scene_audio_url:
+                # Load existing audio from URL
+                try:
+                    from pathlib import Path
+                    from urllib.parse import urlparse
+                    import requests
+                    
+                    logger.info(f"[YouTubeRenderer] Attempting to load existing audio for scene {scene_number} from URL: {scene_audio_url}")
+                    
+                    # Extract filename from URL (e.g., /api/youtube/audio/filename.mp3)
+                    parsed_url = urlparse(scene_audio_url)
+                    audio_filename = Path(parsed_url.path).name
+                    
+                    # Try to load from local file system first
+                    base_dir = Path(__file__).parent.parent.parent.parent
+                    youtube_audio_dir = base_dir / "youtube_audio"
+                    audio_path = youtube_audio_dir / audio_filename
+                    
+                    # Debug: If file not found, try to find it with flexible matching
+                    if not audio_path.exists():
+                        logger.debug(f"[YouTubeRenderer] Audio file not found at {audio_path}. Searching for alternative matches...")
+                        if youtube_audio_dir.exists():
+                            all_files = list(youtube_audio_dir.glob("*.mp3"))
+                            logger.debug(f"[YouTubeRenderer] Found {len(all_files)} MP3 files in directory")
+                            
+                            # Try to find a file that matches the scene (by scene number or title pattern)
+                            # The filename format is: scene_{scene_number}_{clean_title}_{unique_id}.mp3
+                            # Extract components from expected filename
+                            expected_parts = audio_filename.replace('.mp3', '').split('_')
+                            if len(expected_parts) >= 3:
+                                scene_num_str = expected_parts[1] if expected_parts[0] == 'scene' else None
+                                title_part = expected_parts[2] if len(expected_parts) > 2 else None
+                                
+                                # Try to find files matching scene number or title
+                                matching_files = []
+                                for f in all_files:
+                                    file_parts = f.stem.split('_')
+                                    if len(file_parts) >= 3 and file_parts[0] == 'scene':
+                                        file_scene_num = file_parts[1]
+                                        file_title = file_parts[2] if len(file_parts) > 2 else ''
+                                        
+                                        # Match by scene number (try both 0-indexed and 1-indexed)
+                                        if scene_num_str:
+                                            scene_num_int = int(scene_num_str)
+                                            file_scene_int = int(file_scene_num) if file_scene_num.isdigit() else None
+                                            if file_scene_int == scene_num_int or file_scene_int == scene_num_int - 1 or file_scene_int == scene_num_int + 1:
+                                                matching_files.append(f.name)
+                                        # Or match by title
+                                        elif title_part and title_part.lower() in file_title.lower():
+                                            matching_files.append(f.name)
+                                
+                                if matching_files:
+                                    logger.info(
+                                        f"[YouTubeRenderer] Found potential audio file matches for scene {scene_number}: {matching_files[:3]}. "
+                                        f"Expected: {audio_filename}"
+                                    )
+                                    # Try using the first match
+                                    alternative_path = youtube_audio_dir / matching_files[0]
+                                    if alternative_path.exists() and alternative_path.is_file():
+                                        logger.info(f"[YouTubeRenderer] Using alternative audio file: {matching_files[0]}")
+                                        audio_path = alternative_path
+                                        audio_filename = matching_files[0]
+                                    else:
+                                        logger.warning(f"[YouTubeRenderer] Alternative match found but file doesn't exist: {alternative_path}")
+                            else:
+                                # Show sample files for debugging
+                                sample_files = [f.name for f in all_files[:10] if f.name.startswith("scene_")]
+                                if sample_files:
+                                    logger.debug(f"[YouTubeRenderer] Sample scene audio files in directory: {sample_files}")
+                    
+                    if audio_path.exists() and audio_path.is_file():
+                        with open(audio_path, "rb") as f:
+                            audio_bytes = f.read()
+                        audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+                        logger.info(f"[YouTubeRenderer] ✅ Using existing audio for scene {scene_number} from local file: {audio_filename} ({len(audio_bytes)} bytes)")
+                    else:
+                        # File not found locally - try loading from asset library
+                        logger.warning(
+                            f"[YouTubeRenderer] Audio file not found locally at {audio_path}. "
+                            f"Attempting to load from asset library (filename: {audio_filename})"
+                        )
+                        
+                        try:
+                            from services.content_asset_service import ContentAssetService
+                            from services.database import get_db
+                            from models.content_asset_models import AssetType, AssetSource
+                            
+                            db = next(get_db())
+                            try:
+                                asset_service = ContentAssetService(db)
+                                # Try to find the asset by filename and source
+                                assets = asset_service.get_assets(
+                                    user_id=user_id,
+                                    asset_type=AssetType.AUDIO,
+                                    source_module=AssetSource.YOUTUBE_CREATOR,
+                                    limit=100,
+                                )
+                                
+                                # Find matching asset by filename
+                                matching_asset = None
+                                for asset in assets:
+                                    if asset.filename == audio_filename:
+                                        matching_asset = asset
+                                        break
+                                
+                                if matching_asset and matching_asset.file_path:
+                                    asset_path = Path(matching_asset.file_path)
+                                    if asset_path.exists() and asset_path.is_file():
+                                        with open(asset_path, "rb") as f:
+                                            audio_bytes = f.read()
+                                        audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+                                        logger.info(
+                                            f"[YouTubeRenderer] ✅ Loaded audio for scene {scene_number} from asset library: "
+                                            f"{audio_filename} ({len(audio_bytes)} bytes)"
+                                        )
+                                    else:
+                                        raise FileNotFoundError(f"Asset library file path does not exist: {asset_path}")
+                                else:
+                                    raise FileNotFoundError(f"Audio asset not found in library for filename: {audio_filename}")
+                            finally:
+                                db.close()
+                        except Exception as asset_error:
+                            logger.warning(
+                                f"[YouTubeRenderer] Failed to load audio from asset library: {asset_error}. "
+                                f"Original path attempted: {audio_path}"
+                            )
+                            raise FileNotFoundError(
+                                f"Audio file not found at {audio_path} and not found in asset library: {asset_error}"
+                            )
+                                
+                except FileNotFoundError as e:
+                    logger.warning(f"[YouTubeRenderer] ❌ Audio file not found: {e}. Will generate new audio if enabled.")
+                    scene_audio_url = None  # Fall back to generation
+                except Exception as e:
+                    logger.warning(f"[YouTubeRenderer] ❌ Failed to load existing audio: {e}. Will generate new audio if enabled.", exc_info=True)
+                    scene_audio_url = None  # Fall back to generation
+            
+            # Generate audio if not available and generation is enabled
+            if not audio_base64 and generate_audio_enabled and narration and len(narration.strip()) > 0:
+                try:
+                    audio_result = generate_audio(
+                        text=narration,
+                        voice_id=voice_id,
+                        user_id=user_id,
+                    )
+                    # generate_audio may return raw bytes or AudioGenerationResult
+                    audio_bytes = audio_result.audio_bytes if hasattr(audio_result, "audio_bytes") else audio_result
+                    # Convert to base64 (just the base64 string, not data URI)
+                    audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+                    logger.info(f"[YouTubeRenderer] Generated new audio for scene {scene_number}")
+                except Exception as e:
+                    logger.warning(f"[YouTubeRenderer] Audio generation failed: {e}, continuing without audio")
+            
+            # VALIDATION: Final check before expensive video API call
+            if not visual_prompt or len(visual_prompt.strip()) < 5:
+                raise HTTPException(
+                    status_code=400,
+                    detail={
+                        "error": f"Scene {scene_number} has invalid visual prompt",
+                        "scene_number": scene_number,
+                        "message": "Visual prompt must be at least 5 characters",
+                        "user_action": "Please provide a valid visual description for this scene.",
+                    }
+                )
+            
+            # Generate video using WAN 2.5 text-to-video
+            # This is the expensive API call - all validation should be done before this
+            # Use sync mode to wait for result directly (prevents timeout issues)
+            try:
+                video_result = self.wavespeed_client.generate_text_video(
+                    prompt=visual_prompt,
+                    resolution=resolution,
+                    duration=duration,
+                    audio_base64=audio_base64,  # Optional: enables lip-sync if provided
+                    enable_prompt_expansion=True,
+                    enable_sync_mode=True,  # Use sync mode to wait for result directly
+                    timeout=600,  # Increased timeout for sync mode (10 minutes)
+                )
+            except requests.exceptions.Timeout as e:
+                logger.error(f"[YouTubeRenderer] WaveSpeed API timed out for scene {scene_number}: {e}")
+                raise HTTPException(
+                    status_code=504,
+                    detail={
+                        "error": "WaveSpeed request timed out",
+                        "scene_number": scene_number,
+                        "message": "The video generation request timed out.",
+                        "user_action": "Please retry. If it persists, try fewer scenes, lower resolution, or shorter durations.",
+                    },
+                ) from e
+            except requests.exceptions.RequestException as e:
+                logger.error(f"[YouTubeRenderer] WaveSpeed API request failed for scene {scene_number}: {e}")
+                raise HTTPException(
+                    status_code=502,
+                    detail={
+                        "error": "WaveSpeed request failed",
+                        "scene_number": scene_number,
+                        "message": str(e),
+                        "user_action": "Please retry. If it persists, check network connectivity or try again later.",
+                    },
+                ) from e
+            
+            # Save scene video
+            video_service = StoryVideoGenerationService(output_dir=str(self.output_dir))
+            save_result = video_service.save_scene_video(
+                video_bytes=video_result["video_bytes"],
+                scene_number=scene_number,
+                user_id=user_id,
+            )
+            
+            # Update video URL to use YouTube API endpoint
+            filename = save_result["video_filename"]
+            save_result["video_url"] = f"/api/youtube/videos/{filename}"
+            
+            # Track usage
+            usage_info = track_video_usage(
+                user_id=user_id,
+                provider=video_result["provider"],
+                model_name=video_result["model_name"],
+                prompt=visual_prompt,
+                video_bytes=video_result["video_bytes"],
+                cost_override=video_result["cost"],
+            )
+            
+            logger.info(
+                f"[YouTubeRenderer] ✅ Scene {scene_number} rendered: "
+                f"cost=${video_result['cost']:.2f}, size={len(video_result['video_bytes'])} bytes"
+            )
+            
+            return {
+                "scene_number": scene_number,
+                "video_filename": save_result["video_filename"],
+                "video_url": save_result["video_url"],
+                "video_path": save_result["video_path"],
+                "duration": video_result["duration"],
+                "cost": video_result["cost"],
+                "resolution": resolution,
+                "width": video_result["width"],
+                "height": video_result["height"],
+                "file_size": save_result["file_size"],
+                "prediction_id": video_result.get("prediction_id"),
+                "usage_info": usage_info,
+            }
+            
+        except HTTPException as e:
+            # Re-raise with better error message for UI
+            error_detail = e.detail
+            if isinstance(error_detail, dict):
+                error_msg = error_detail.get("error", str(error_detail))
+            else:
+                error_msg = str(error_detail)
+            
+            logger.error(
+                f"[YouTubeRenderer] Scene {scene_number} failed: {error_msg}",
+                exc_info=True
+            )
+            raise HTTPException(
+                status_code=e.status_code,
+                detail={
+                    "error": f"Failed to render scene {scene_number}",
+                    "scene_number": scene_number,
+                    "message": error_msg,
+                    "user_action": "Please try again. If the issue persists, check your scene content and try a different resolution.",
+                }
+            )
+        except Exception as e:
+            logger.error(f"[YouTubeRenderer] Error rendering scene {scene_number}: {e}", exc_info=True)
+            raise HTTPException(
+                status_code=500,
+                detail={
+                    "error": f"Failed to render scene {scene_number}",
+                    "scene_number": scene_number,
+                    "message": str(e),
+                    "user_action": "Please try again. If the issue persists, check your scene content and try a different resolution.",
+                }
+            )
+    
+    def render_full_video(
+        self,
+        scenes: List[Dict[str, Any]],
+        video_plan: Dict[str, Any],
+        user_id: str,
+        resolution: str = "720p",
+        combine_scenes: bool = True,
+        voice_id: str = "Wise_Woman",
+    ) -> Dict[str, Any]:
+        """
+        Render a complete video from multiple scenes.
+        
+        Args:
+            scenes: List of scene data
+            video_plan: Original video plan
+            user_id: Clerk user ID
+            resolution: Video resolution
+            combine_scenes: Whether to combine scenes into single video
+            voice_id: Voice ID for narration
+            
+        Returns:
+            Dictionary with video metadata and scene results
+        """
+        try:
+            logger.info(
+                f"[YouTubeRenderer] Rendering full video: {len(scenes)} scenes, "
+                f"resolution={resolution}, user={user_id}"
+            )
+            
+            # Filter enabled scenes
+            enabled_scenes = [s for s in scenes if s.get("enabled", True)]
+            if not enabled_scenes:
+                raise HTTPException(status_code=400, detail="No enabled scenes to render")
+            
+            scene_results = []
+            total_cost = 0.0
+            
+            # Render each scene
+            for idx, scene in enumerate(enabled_scenes):
+                logger.info(
+                    f"[YouTubeRenderer] Rendering scene {idx + 1}/{len(enabled_scenes)}: "
+                    f"Scene {scene.get('scene_number', idx + 1)}"
+                )
+                
+                scene_result = self.render_scene_video(
+                    scene=scene,
+                    video_plan=video_plan,
+                    user_id=user_id,
+                    resolution=resolution,
+                    generate_audio_enabled=True,
+                    voice_id=voice_id,
+                )
+                
+                scene_results.append(scene_result)
+                total_cost += scene_result["cost"]
+            
+            # Combine scenes if requested
+            final_video_path = None
+            final_video_url = None
+            if combine_scenes and len(scene_results) > 1:
+                logger.info("[YouTubeRenderer] Combining scenes into final video...")
+                
+                # Prepare data for video concatenation
+                scene_video_paths = [r["video_path"] for r in scene_results]
+                scene_audio_paths = [r.get("audio_path") for r in scene_results if r.get("audio_path")]
+                
+                # Use StoryVideoGenerationService to combine
+                video_service = StoryVideoGenerationService(output_dir=str(self.output_dir))
+                
+                # Create scene dicts for concatenation
+                scene_dicts = [
+                    {
+                        "scene_number": r["scene_number"],
+                        "title": f"Scene {r['scene_number']}",
+                    }
+                    for r in scene_results
+                ]
+                
+                combined_result = video_service.generate_story_video(
+                    scenes=scene_dicts,
+                    image_paths=[None] * len(scene_results),  # No static images
+                    audio_paths=scene_audio_paths if scene_audio_paths else [],
+                    video_paths=scene_video_paths,  # Use rendered videos
+                    user_id=user_id,
+                    story_title=video_plan.get("video_summary", "YouTube Video")[:50],
+                    fps=24,
+                )
+                
+                final_video_path = combined_result["video_path"]
+                final_video_url = combined_result["video_url"]
+            
+            logger.info(
+                f"[YouTubeRenderer] ✅ Full video rendered: {len(scene_results)} scenes, "
+                f"total_cost=${total_cost:.2f}"
+            )
+            
+            return {
+                "success": True,
+                "scene_results": scene_results,
+                "total_cost": total_cost,
+                "final_video_path": final_video_path,
+                "final_video_url": final_video_url,
+                "num_scenes": len(scene_results),
+                "resolution": resolution,
+            }
+            
+        except HTTPException:
+            raise
+        except Exception as e:
+            logger.error(f"[YouTubeRenderer] Error rendering full video: {e}", exc_info=True)
+            raise HTTPException(
+                status_code=500,
+                detail=f"Failed to render video: {str(e)}"
+            )
+    
+    def estimate_render_cost(
+        self,
+        scenes: List[Dict[str, Any]],
+        resolution: str = "720p",
+        image_model: str = "ideogram-v3-turbo",
+    ) -> Dict[str, Any]:
+        """
+        Estimate the cost of rendering a video before actually rendering it.
+        
+        Args:
+            scenes: List of scene data with duration estimates
+            resolution: Video resolution (480p, 720p, 1080p)
+            
+        Returns:
+            Dictionary with cost breakdown and total estimate
+        """
+        # Pricing per second (same as in WaveSpeedClient)
+        pricing = {
+            "480p": 0.05,
+            "720p": 0.10,
+            "1080p": 0.15,
+        }
+
+        price_per_second = pricing.get(resolution, 0.10)
+
+        # Image generation pricing
+        image_pricing = {
+            "ideogram-v3-turbo": 0.10,
+            "qwen-image": 0.05,
+        }
+
+        image_cost_per_scene = image_pricing.get(image_model, 0.10)
+        
+        # Filter enabled scenes
+        enabled_scenes = [s for s in scenes if s.get("enabled", True)]
+        
+        scene_costs = []
+        total_cost = 0.0
+        total_duration = 0.0
+        total_image_cost = len(enabled_scenes) * image_cost_per_scene
+
+        for scene in enabled_scenes:
+            scene_number = scene.get("scene_number", 0)
+            duration_estimate = scene.get("duration_estimate", 5)
+            
+            # Clamp duration to valid WAN 2.5 values (5 or 10 seconds)
+            duration = 5 if duration_estimate <= 7 else 10
+            
+            scene_cost = price_per_second * duration
+            scene_costs.append({
+                "scene_number": scene_number,
+                "duration_estimate": duration_estimate,
+                "actual_duration": duration,
+                "cost": round(scene_cost, 2),
+            })
+            
+            total_cost += scene_cost
+            total_duration += duration
+
+        # Add image costs to total
+        total_cost += total_image_cost
+
+        return {
+            "resolution": resolution,
+            "price_per_second": price_per_second,
+            "num_scenes": len(enabled_scenes),
+            "total_duration_seconds": total_duration,
+            "scene_costs": scene_costs,
+            "total_cost": round(total_cost, 2),
+            "estimated_cost_range": {
+                "min": round(total_cost * 0.9, 2),  # 10% buffer
+                "max": round(total_cost * 1.1, 2),  # 10% buffer
+            },
+            "image_model": image_model,
+            "image_cost_per_scene": image_cost_per_scene,
+            "total_image_cost": round(total_image_cost, 2),
+        }
+