WIP: AI Podcast Maker and YouTube Creator Studio integration

2025-12-10 09:37:55 +05:30
parent 31f078c763
commit 81590cf4db
75 changed files with 11879 additions and 1380 deletions
--- a/backend/services/wavespeed/client.py
+++ b/backend/services/wavespeed/client.py
@@ -637,4 +637,260 @@ class WaveSpeedClient:
                status_code=502,
                detail="Failed to fetch generated audio from WaveSpeed URL",
            )
+    
+    def submit_text_to_video(
+        self,
+        model_path: str,
+        payload: Dict[str, Any],
+        timeout: int = 60,
+    ) -> str:
+        """
+        Submit a text-to-video generation request to WaveSpeed.
+
+        Args:
+            model_path: Model path (e.g., "alibaba/wan-2.5/text-to-video")
+            payload: Request payload with prompt, resolution, duration, optional audio
+            timeout: Request timeout in seconds
+
+        Returns:
+            Prediction ID for polling
+
+        Raises:
+            HTTPException: 502 when WaveSpeed answers with a non-200 status, or
+                with a 200 whose body is not a JSON object carrying ``data.id``.
+        """
+        url = f"{self.BASE_URL}/{model_path}"
+        logger.info(f"[WaveSpeed] Submitting text-to-video request to {url}")
+        response = requests.post(url, headers=self._headers(), json=payload, timeout=timeout)
+
+        if response.status_code != 200:
+            logger.error(f"[WaveSpeed] Text-to-video submission failed: {response.status_code} {response.text}")
+            raise HTTPException(
+                status_code=502,
+                detail={
+                    "error": "WaveSpeed text-to-video submission failed",
+                    "status_code": response.status_code,
+                    "response": response.text,
+                },
+            )
+
+        # A 200 with a non-JSON body makes response.json() raise a ValueError
+        # subclass; treat it like any other malformed response instead of
+        # letting it escape as an unhandled error.
+        try:
+            body = response.json()
+        except ValueError:
+            body = None
+
+        # Only a JSON object can carry "data"; only a dict "data" can carry "id".
+        data = body.get("data") if isinstance(body, dict) else None
+        if not isinstance(data, dict) or "id" not in data:
+            logger.error(f"[WaveSpeed] Unexpected text-to-video response: {response.text}")
+            raise HTTPException(
+                status_code=502,
+                detail={"error": "WaveSpeed response missing prediction id"},
+            )
+
+        prediction_id = data["id"]
+        logger.info(f"[WaveSpeed] Submitted text-to-video request: {prediction_id}")
+        return prediction_id
+    
+    def generate_text_video(
+        self,
+        prompt: str,
+        resolution: str = "720p",  # 480p, 720p, 1080p
+        duration: int = 5,  # 5 or 10 seconds
+        audio_base64: Optional[str] = None,  # Optional audio for lip-sync
+        negative_prompt: Optional[str] = None,
+        seed: Optional[int] = None,
+        enable_prompt_expansion: bool = True,
+        enable_sync_mode: bool = False,
+        timeout: int = 180,
+    ) -> Dict[str, Any]:
+        """
+        Generate video from text prompt using WAN 2.5 text-to-video.
+
+        In sync mode a single request is posted and the video URL is read
+        straight from its response. Otherwise the request is submitted via
+        ``submit_text_to_video`` and polled with ``poll_until_complete``.
+        Either way the resulting URL is then downloaded.
+
+        Args:
+            prompt: Text prompt describing the video
+            resolution: Output resolution (480p, 720p, 1080p)
+            duration: Video duration in seconds (5 or 10)
+            audio_base64: Optional audio file (wav/mp3, 3-30s, ≤15MB) for lip-sync
+            negative_prompt: Optional negative prompt
+            seed: Optional random seed for reproducibility
+            enable_prompt_expansion: Enable prompt optimizer
+            enable_sync_mode: If True, wait for result and return it directly
+            timeout: Timeout in seconds for the submission request; in async
+                mode it is also passed to the poller as its time budget
+
+        Returns:
+            Dictionary with keys ``video_bytes``, ``prompt``, ``duration``,
+            ``model_name``, ``cost``, ``provider``, ``source_video_url``,
+            ``prediction_id``, ``resolution``, ``width``, ``height`` and
+            ``metadata``.
+
+        Raises:
+            HTTPException: 400 for an unsupported resolution or duration; 502
+                when WaveSpeed rejects the request, returns a malformed body
+                or no outputs, or the video download fails. Errors raised
+                while polling are re-raised with ``prediction_id`` and
+                ``resume_available`` added when their detail is a dict.
+        """
+        model_path = "alibaba/wan-2.5/text-to-video"
+
+        # Validate resolution
+        valid_resolutions = ["480p", "720p", "1080p"]
+        if resolution not in valid_resolutions:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Invalid resolution: {resolution}. Must be one of: {valid_resolutions}"
+            )
+
+        # Validate duration
+        if duration not in (5, 10):
+            raise HTTPException(
+                status_code=400,
+                detail="Duration must be 5 or 10 seconds"
+            )
+
+        # Build payload; optional fields are only sent when provided
+        payload: Dict[str, Any] = {
+            "prompt": prompt,
+            "resolution": resolution,
+            "duration": duration,
+            "enable_prompt_expansion": enable_prompt_expansion,
+            "enable_sync_mode": enable_sync_mode,
+        }
+        if audio_base64:
+            payload["audio"] = audio_base64
+        if negative_prompt:
+            payload["negative_prompt"] = negative_prompt
+        if seed is not None:
+            payload["seed"] = seed
+
+        logger.info(
+            f"[WaveSpeed] Generating text-to-video: resolution={resolution}, "
+            f"duration={duration}s, prompt_length={len(prompt)}, sync_mode={enable_sync_mode}"
+        )
+
+        if enable_sync_mode:
+            # Sync mode - the submission response already carries the outputs
+            url = f"{self.BASE_URL}/{model_path}"
+            response = requests.post(url, headers=self._headers(), json=payload, timeout=timeout)
+
+            if response.status_code != 200:
+                logger.error(f"[WaveSpeed] Text-to-video submission failed: {response.status_code} {response.text}")
+                raise HTTPException(
+                    status_code=502,
+                    detail={
+                        "error": "WaveSpeed text-to-video submission failed",
+                        "status_code": response.status_code,
+                        "response": response.text[:500],
+                    },
+                )
+
+            # A 200 with a non-JSON or non-object body must surface as a 502,
+            # not as a raw ValueError/AttributeError.
+            try:
+                response_json = response.json()
+            except ValueError:
+                response_json = None
+            if not isinstance(response_json, dict):
+                logger.error(f"[WaveSpeed] Unexpected sync mode response: {response.text[:500]}")
+                raise HTTPException(
+                    status_code=502,
+                    detail="WaveSpeed text-to-video returned an invalid response in sync mode",
+                )
+
+            # Results may sit under "data" or at the top level of the body
+            data = response_json.get("data")
+            if not isinstance(data, dict) or not data:
+                data = response_json
+
+            outputs = data.get("outputs") or []
+            if not outputs:
+                logger.error(f"[WaveSpeed] No outputs in sync mode response: {response.text[:500]}")
+                raise HTTPException(
+                    status_code=502,
+                    detail="WaveSpeed text-to-video returned no outputs in sync mode",
+                )
+
+            prediction_id = data.get("id", "sync_mode")
+            metadata = data.get("metadata") or {}
+        else:
+            # Async mode - submit and poll
+            prediction_id = self.submit_text_to_video(model_path, payload, timeout=timeout)
+
+            try:
+                result = self.poll_until_complete(
+                    prediction_id,
+                    timeout_seconds=timeout,
+                    interval_seconds=2.0
+                )
+            except HTTPException as e:
+                # Attach the prediction id so the caller can resume later;
+                # non-dict details are passed through unchanged.
+                detail = e.detail or {}
+                if isinstance(detail, dict):
+                    detail.setdefault("prediction_id", prediction_id)
+                    detail.setdefault("resume_available", True)
+                raise HTTPException(status_code=e.status_code, detail=detail) from e
+
+            outputs = result.get("outputs") or []
+            if not outputs:
+                raise HTTPException(
+                    status_code=502,
+                    detail="WAN 2.5 text-to-video completed but returned no outputs"
+                )
+
+            metadata = result.get("metadata") or {}
+
+        # The first output is expected to be the video URL
+        video_url = outputs[0]
+        video_bytes = self._download_text_video(video_url, sync_mode=enable_sync_mode)
+
+        # Calculate cost (same pricing as image-to-video): per-second rate x duration
+        pricing = {
+            "480p": 0.05,
+            "720p": 0.10,
+            "1080p": 0.15,
+        }
+        cost = pricing.get(resolution, 0.10) * duration
+
+        # Get video dimensions
+        resolution_dims = {
+            "480p": (854, 480),
+            "720p": (1280, 720),
+            "1080p": (1920, 1080),
+        }
+        width, height = resolution_dims.get(resolution, (1280, 720))
+
+        logger.info(
+            f"[WaveSpeed] ✅ Generated text-to-video: {len(video_bytes)} bytes, "
+            f"resolution={resolution}, duration={duration}s, cost=${cost:.2f}"
+        )
+
+        return {
+            "video_bytes": video_bytes,
+            "prompt": prompt,
+            "duration": float(duration),
+            "model_name": "alibaba/wan-2.5/text-to-video",
+            "cost": cost,
+            "provider": "wavespeed",
+            "source_video_url": video_url,
+            "prediction_id": prediction_id,
+            "resolution": resolution,
+            "width": width,
+            "height": height,
+            "metadata": metadata,
+        }
+
+    def _download_text_video(self, video_url: Any, sync_mode: bool = False) -> bytes:
+        """
+        Validate a text-to-video output URL and download the video behind it.
+
+        Args:
+            video_url: First entry of the ``outputs`` list returned by WaveSpeed
+            sync_mode: Selects the sync-mode wording of log and error messages
+
+        Returns:
+            Raw bytes of the downloaded video
+
+        Raises:
+            HTTPException: 502 when the value is not an http(s) URL string or
+                the download does not return status 200.
+        """
+        mode_suffix = " in sync mode" if sync_mode else ""
+        if not isinstance(video_url, str) or not video_url.startswith("http"):
+            logger.error(f"[WaveSpeed] Invalid video URL format{mode_suffix}: {video_url}")
+            raise HTTPException(
+                status_code=502,
+                detail=f"Invalid video URL format: {video_url}",
+            )
+
+        if sync_mode:
+            logger.info(f"[WaveSpeed] Downloading video from sync mode URL: {video_url}")
+            download_error = "Failed to download WAN 2.5 video from sync mode"
+        else:
+            logger.info(f"[WaveSpeed] Downloading video from: {video_url}")
+            download_error = "Failed to download WAN 2.5 video"
+
+        video_response = requests.get(video_url, timeout=180)
+        if video_response.status_code != 200:
+            raise HTTPException(
+                status_code=502,
+                detail={
+                    "error": download_error,
+                    "status_code": video_response.status_code,
+                    "response": video_response.text[:200],
+                }
+            )
+
+        return video_response.content