AI Image Studio, AI podcast Maker, AI product Marketing

2025-11-28 14:33:52 +05:30
parent 77d7c0cde6
commit 49e2131715
122 changed files with 22311 additions and 4331 deletions
--- a/backend/services/image_studio/wan25_service.py
+++ b/backend/services/image_studio/wan25_service.py
@@ -0,0 +1,295 @@
+"""WAN 2.5 service for Alibaba image-to-video generation via WaveSpeed."""
+
+import base64
+import asyncio
+from typing import Any, Dict, Optional
+import requests
+from fastapi import HTTPException
+from loguru import logger
+
+from services.wavespeed.client import WaveSpeedClient
+from utils.logger_utils import get_service_logger
+
+# Service-scoped logger. This rebinds the `logger` name imported from loguru
+# above, so every log call in this module goes through the service logger.
+logger = get_service_logger("image_studio.wan25")
+
+# WAN25_MODEL_PATH is the model identifier passed to the WaveSpeed client on
+# submission; WAN25_MODEL_NAME is the label returned to callers in the result
+# dict. Both currently hold the same string.
+WAN25_MODEL_PATH = "alibaba/wan-2.5/image-to-video"
+WAN25_MODEL_NAME = "alibaba/wan-2.5/image-to-video"
+
+# Pricing per second (from WaveSpeed docs)
+# The keys also serve as the set of resolutions accepted by generate_video.
+PRICING = {
+    "480p": 0.05,   # $0.05 per second
+    "720p": 0.10,   # $0.10 per second
+    "1080p": 0.15,  # $0.15 per second
+}
+
+# Upper bounds on the decoded (binary) payload sizes, checked before submission.
+MAX_IMAGE_BYTES = 10 * 1024 * 1024  # 10MB (recommended)
+MAX_AUDIO_BYTES = 15 * 1024 * 1024  # 15MB (API limit)
+# NOTE(review): the two duration bounds below are not referenced by the code
+# visible in this file; audio length is currently left to the API to handle.
+MIN_AUDIO_DURATION = 3  # seconds
+MAX_AUDIO_DURATION = 30  # seconds
+
+
+def _as_data_uri(content_bytes: bytes, mime_type: str) -> str:
+    """Wrap raw bytes in a base64 ``data:`` URI tagged with ``mime_type``."""
+    b64_text = base64.b64encode(content_bytes).decode("utf-8")
+    return "data:{};base64,{}".format(mime_type, b64_text)
+
+
+def _decode_base64_image(image_base64: str) -> tuple[bytes, str]:
+    """Decode an image supplied as raw base64 or as a ``data:`` URI.
+
+    Returns:
+        ``(image_bytes, mime_type)``. The MIME type is read from the data URI
+        header when present and non-empty, otherwise it is ``"image/png"``.
+
+    Raises:
+        ValueError: if a data URI has no comma separating header and payload
+            (base64 decoding errors are also ``ValueError`` subclasses).
+    """
+    fallback_mime = "image/png"
+
+    # Plain base64 carries no type information, so the default is assumed.
+    if not image_base64.startswith("data:"):
+        return base64.b64decode(image_base64), fallback_mime
+
+    header, separator, encoded = image_base64.partition(",")
+    if not separator:
+        raise ValueError("Invalid data URI format: missing comma separator")
+
+    # The header always starts with "data:", so the segment after the first
+    # colon exists; everything from the first ";" on (e.g. ";base64") is dropped.
+    declared_mime = header.split(":", 2)[1].partition(";")[0].strip()
+    return base64.b64decode(encoded), declared_mime or fallback_mime
+
+
+def _decode_base64_audio(audio_base64: str) -> tuple[bytes, str]:
+    """Decode audio supplied as raw base64 or as a ``data:`` URI.
+
+    Returns:
+        ``(audio_bytes, mime_type)``. The MIME type is read from the data URI
+        header when present and non-empty, otherwise it is ``"audio/mpeg"``.
+
+    Raises:
+        ValueError: if a data URI has no comma separating header and payload
+            (base64 decoding errors are also ``ValueError`` subclasses).
+    """
+    fallback_mime = "audio/mpeg"
+
+    # Plain base64 carries no type information, so the default is assumed.
+    if not audio_base64.startswith("data:"):
+        return base64.b64decode(audio_base64), fallback_mime
+
+    header, separator, encoded = audio_base64.partition(",")
+    if not separator:
+        raise ValueError("Invalid data URI format: missing comma separator")
+
+    # The header always starts with "data:", so the segment after the first
+    # colon exists; everything from the first ";" on (e.g. ";base64") is dropped.
+    declared_mime = header.split(":", 2)[1].partition(";")[0].strip()
+    return base64.b64decode(encoded), declared_mime or fallback_mime
+
+
+class WAN25Service:
+    """Service for Alibaba WAN 2.5 image-to-video generation.
+
+    Validates a generation request, submits it through a ``WaveSpeedClient``,
+    polls for completion, downloads the resulting video and reports the cost
+    derived from the per-second price table.
+    """
+
+    # Frame size reported to callers for each supported resolution label.
+    _RESOLUTION_DIMS = {
+        "480p": (854, 480),
+        "720p": (1280, 720),
+        "1080p": (1920, 1080),
+    }
+
+    def __init__(self, client: Optional[WaveSpeedClient] = None):
+        """Initialize WAN 2.5 service.
+
+        Args:
+            client: WaveSpeed client to use; a new ``WaveSpeedClient`` is
+                created when omitted.
+        """
+        self.client = client or WaveSpeedClient()
+        logger.info("[WAN 2.5] Service initialized")
+
+    def calculate_cost(self, resolution: str, duration: int) -> float:
+        """Calculate cost for video generation.
+
+        Args:
+            resolution: Output resolution (480p, 720p, 1080p). An unknown
+                value is priced at the 720p rate.
+            duration: Video duration in seconds (5 or 10)
+
+        Returns:
+            Cost in USD
+        """
+        cost_per_second = PRICING.get(resolution, PRICING["720p"])
+        return cost_per_second * duration
+
+    async def generate_video(
+        self,
+        image_base64: str,
+        prompt: str,
+        audio_base64: Optional[str] = None,
+        resolution: str = "720p",
+        duration: int = 5,
+        negative_prompt: Optional[str] = None,
+        seed: Optional[int] = None,
+        enable_prompt_expansion: bool = True,
+    ) -> Dict[str, Any]:
+        """Generate video using WAN 2.5.
+
+        Args:
+            image_base64: Image in base64 or data URI format
+            prompt: Text prompt describing the video
+            audio_base64: Optional audio file (wav/mp3, 3-30s, ≤15MB)
+            resolution: Output resolution (480p, 720p, 1080p)
+            duration: Video duration in seconds (5 or 10)
+            negative_prompt: Optional negative prompt
+            seed: Optional random seed for reproducibility
+            enable_prompt_expansion: Enable prompt optimizer
+
+        Returns:
+            Dictionary with video bytes, metadata, and cost
+
+        Raises:
+            HTTPException: 400 for an invalid resolution, duration, prompt,
+                image or audio; 502 when the job finishes without a usable
+                output URL or the video cannot be downloaded. Errors raised
+                by the client are propagated; a polling failure with a dict
+                ``detail`` additionally carries ``prediction_id`` and
+                ``resume_available``.
+        """
+        # Validate resolution
+        if resolution not in PRICING:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Invalid resolution: {resolution}. Must be one of: {list(PRICING.keys())}"
+            )
+
+        # Validate duration
+        if duration not in (5, 10):
+            raise HTTPException(
+                status_code=400,
+                detail=f"Invalid duration: {duration}. Must be 5 or 10 seconds"
+            )
+
+        # Validate prompt
+        if not prompt or not prompt.strip():
+            raise HTTPException(
+                status_code=400,
+                detail="Prompt is required and cannot be empty"
+            )
+
+        # Decode image
+        try:
+            image_bytes, image_mime = _decode_base64_image(image_base64)
+        except Exception as e:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Failed to decode image: {str(e)}"
+            ) from e
+
+        # Validate image size
+        if len(image_bytes) > MAX_IMAGE_BYTES:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Image exceeds {MAX_IMAGE_BYTES / (1024*1024):.0f}MB limit"
+            )
+
+        # Build payload
+        payload = {
+            "image": _as_data_uri(image_bytes, image_mime),
+            "prompt": prompt,
+            "resolution": resolution,
+            "duration": duration,
+            "enable_prompt_expansion": enable_prompt_expansion,
+        }
+
+        # Add optional audio
+        if audio_base64:
+            # Only the decode step is guarded: the size-limit error below must
+            # reach the caller as-is instead of being re-wrapped as a
+            # "Failed to decode audio" error.
+            try:
+                audio_bytes, audio_mime = _decode_base64_audio(audio_base64)
+            except Exception as e:
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"Failed to decode audio: {str(e)}"
+                ) from e
+
+            # Validate audio size
+            if len(audio_bytes) > MAX_AUDIO_BYTES:
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"Audio exceeds {MAX_AUDIO_BYTES / (1024*1024):.0f}MB limit"
+                )
+
+            # Note: Audio duration validation would require audio analysis
+            # For now, we rely on API to handle it (API keeps first 5s/10s if longer)
+            payload["audio"] = _as_data_uri(audio_bytes, audio_mime)
+
+        # Add optional parameters
+        if negative_prompt:
+            payload["negative_prompt"] = negative_prompt
+
+        if seed is not None:
+            payload["seed"] = seed
+
+        # Submit to WaveSpeed
+        logger.info(
+            f"[WAN 2.5] Submitting video generation request: resolution={resolution}, duration={duration}s"
+        )
+
+        # The client methods are called synchronously (no await), so they are
+        # run in a worker thread to keep the event loop responsive, the same
+        # way the download further down is handled.
+        try:
+            prediction_id = await asyncio.to_thread(
+                self.client.submit_image_to_video,
+                WAN25_MODEL_PATH,
+                payload,
+                timeout=60,
+            )
+        except HTTPException as e:
+            logger.error(f"[WAN 2.5] Submission failed: {e.detail}")
+            raise
+
+        # Poll for completion
+        logger.info(f"[WAN 2.5] Polling for completion: prediction_id={prediction_id}")
+
+        try:
+            # WAN 2.5 typically takes 1-2 minutes
+            result = await asyncio.to_thread(
+                self.client.poll_until_complete,
+                prediction_id,
+                timeout_seconds=180,  # 3 minutes max
+                interval_seconds=2.0,
+            )
+        except HTTPException as e:
+            detail = e.detail or {}
+            if isinstance(detail, dict):
+                # Copy instead of mutating the original exception's detail;
+                # keys already present in the detail take precedence.
+                detail = {
+                    "prediction_id": prediction_id,
+                    "resume_available": True,
+                    **detail,
+                }
+            raise HTTPException(status_code=e.status_code, detail=detail) from e
+
+        # Extract video URL
+        outputs = result.get("outputs") or []
+        if not outputs:
+            raise HTTPException(
+                status_code=502,
+                detail="WAN 2.5 completed but returned no outputs"
+            )
+
+        video_url = outputs[0]
+        if not isinstance(video_url, str) or not video_url.startswith("http"):
+            raise HTTPException(
+                status_code=502,
+                detail=f"Invalid video URL format: {video_url}"
+            )
+
+        # Download video (run synchronous request in thread)
+        logger.info(f"[WAN 2.5] Downloading video from: {video_url}")
+        try:
+            video_response = await asyncio.to_thread(
+                requests.get,
+                video_url,
+                timeout=180
+            )
+        except requests.RequestException as e:
+            # Connection errors and timeouts are upstream failures; report
+            # them like a non-200 download instead of an unhandled error.
+            logger.error(f"[WAN 2.5] Video download failed: {e}")
+            raise HTTPException(
+                status_code=502,
+                detail={
+                    "error": "Failed to download WAN 2.5 video",
+                    "prediction_id": prediction_id,
+                    "response": str(e)[:200],
+                }
+            ) from e
+
+        if video_response.status_code != 200:
+            raise HTTPException(
+                status_code=502,
+                detail={
+                    "error": "Failed to download WAN 2.5 video",
+                    "status_code": video_response.status_code,
+                    "response": video_response.text[:200],
+                }
+            )
+
+        video_bytes = video_response.content
+        metadata = result.get("metadata") or {}
+
+        # Calculate cost
+        cost = self.calculate_cost(resolution, duration)
+
+        # Get video dimensions from resolution
+        width, height = self._RESOLUTION_DIMS.get(resolution, (1280, 720))
+
+        logger.info(
+            f"[WAN 2.5] ✅ Generated video: {len(video_bytes)} bytes, "
+            f"resolution={resolution}, duration={duration}s, cost=${cost:.2f}"
+        )
+
+        return {
+            "video_bytes": video_bytes,
+            "prompt": prompt,
+            "duration": float(duration),
+            "model_name": WAN25_MODEL_NAME,
+            "cost": cost,
+            "provider": "wavespeed",
+            "source_video_url": video_url,
+            "prediction_id": prediction_id,
+            "resolution": resolution,
+            "width": width,
+            "height": height,
+            "metadata": metadata,
+        }
+