from __future__ import annotations import base64 from typing import Any, Dict, Optional import requests from fastapi import HTTPException from loguru import logger from .client import WaveSpeedClient INFINITALK_MODEL_PATH = "wavespeed-ai/infinitetalk" INFINITALK_MODEL_NAME = "wavespeed-ai/infinitetalk" INFINITALK_DEFAULT_COST = 0.30 # $0.30 per 5 seconds at 720p tier MAX_IMAGE_BYTES = 10 * 1024 * 1024 # 10MB MAX_AUDIO_BYTES = 50 * 1024 * 1024 # 50MB safety cap def _as_data_uri(content_bytes: bytes, mime_type: str) -> str: encoded = base64.b64encode(content_bytes).decode("utf-8") return f"data:{mime_type};base64,{encoded}" def _generate_simple_infinitetalk_prompt( scene_data: Dict[str, Any], story_context: Dict[str, Any], ) -> Optional[str]: """ Generate an enhanced prompt for InfiniteTalk video generation. Includes scene content, analysis, bible context, and visual elements. Returns None if no meaningful prompt can be generated. """ title = (scene_data.get("title") or "").strip() description = (scene_data.get("description") or "").strip() image_prompt = (scene_data.get("image_prompt") or "").strip() lines = scene_data.get("lines", []) narration = "" if lines: # Combine first few lines for context narration = " ".join([str(l.get("text", "")) for l in lines[:3]])[:150] # Build enhanced prompt with multiple context sources parts = [] # Add main scene title if title and len(title) > 5 and title.lower() not in ("scene", "podcast", "episode"): parts.append(title) # Add analysis context analysis = story_context.get("analysis", {}) if analysis: content_type = analysis.get("content_type") if content_type: parts.append(f"Content type: {content_type}") # Add key takeaways if available key_takeaways = analysis.get("keyTakeaways", []) if key_takeaways and isinstance(key_takeaways, list) and len(key_takeaways) > 0: takeaway = str(key_takeaways[0])[:80] if takeaway: parts.append(f"Key insight: {takeaway}") # Audience audience = analysis.get("audience") if audience: short_audience = " ".join(audience.split()[:3]) parts.append(f"Target audience: {short_audience}") # Guest info guest_name = analysis.get("guestName") guest_expertise = analysis.get("guestExpertise") if guest_name: parts.append(f"Guest: {guest_name}") if guest_expertise: parts.append(f"Expertise: {guest_expertise}") # Add bible context bible = story_context.get("bible", {}) if bible: host_persona = bible.get("host_persona") tone = bible.get("tone") visual_style = bible.get("visual_style") background = bible.get("background") if host_persona: parts.append(f"Host persona: {host_persona}") if tone: parts.append(f"Tone: {tone}") if visual_style: parts.append(f"Visual style: {visual_style}") if background: parts.append(f"Background: {background}") # Add original image prompt as fallback context if image_prompt and len(parts) < 3: img_part = image_prompt.split('.')[0][:100].strip() if img_part: parts.append(f"Visual context: {img_part}") # Add narration snippet if available if narration and len(parts) < 4: parts.append(f"Discussing: {narration}") if not parts: return None # Build prompt with visual quality keywords quality_keywords = "Cinematic lighting, high detail, 4k quality, smooth motion" # Combine parts into final prompt prompt = f"{'. '.join(parts)}. {quality_keywords}. With subtle natural movement." # Allow more room for detailed prompts - max 350 characters prompt = prompt[:350].strip() # Clean up trailing punctuation if prompt.endswith(',') or prompt.endswith('.'): prompt = prompt[:-1].strip() return prompt if len(prompt) >= 15 else None def animate_scene_with_voiceover( *, image_bytes: bytes, audio_bytes: bytes, scene_data: Dict[str, Any], story_context: Dict[str, Any], user_id: str, resolution: str = "720p", prompt_override: Optional[str] = None, mask_image_bytes: Optional[bytes] = None, seed: Optional[int] = -1, image_mime: str = "image/png", audio_mime: str = "audio/mpeg", client: Optional[WaveSpeedClient] = None, ) -> Dict[str, Any]: """ Animate a scene image with narration audio using WaveSpeed InfiniteTalk. Returns dict with video bytes, prompt used, model name, and cost. """ if not image_bytes: raise HTTPException(status_code=404, detail="Scene image bytes missing for animation.") if not audio_bytes: raise HTTPException(status_code=404, detail="Scene audio bytes missing for animation.") if len(image_bytes) > MAX_IMAGE_BYTES: raise HTTPException( status_code=400, detail="Scene image exceeds 10MB limit required by WaveSpeed InfiniteTalk.", ) if len(audio_bytes) > MAX_AUDIO_BYTES: raise HTTPException( status_code=400, detail="Scene audio exceeds 50MB limit allowed for InfiniteTalk requests.", ) if resolution not in {"480p", "720p"}: raise HTTPException(status_code=400, detail="Resolution must be '480p' or '720p'.") # Generate simple, concise prompt for InfiniteTalk (audio-driven, less need for elaborate descriptions) animation_prompt = prompt_override or _generate_simple_infinitetalk_prompt(scene_data, story_context) payload: Dict[str, Any] = { "image": _as_data_uri(image_bytes, image_mime), "audio": _as_data_uri(audio_bytes, audio_mime), "resolution": resolution, } # Only include prompt if we have a meaningful one (InfiniteTalk works fine without it) if animation_prompt: payload["prompt"] = animation_prompt if mask_image_bytes: payload["mask_image"] = _as_data_uri(mask_image_bytes, image_mime) if seed is not None: payload["seed"] = seed client = client or WaveSpeedClient() prediction_id = client.submit_image_to_video(INFINITALK_MODEL_PATH, payload, timeout=60) try: # Poll faster (0.5s) to mirror reference pattern; allow up to 10 minutes result = client.poll_until_complete(prediction_id, timeout_seconds=600, interval_seconds=0.5) except HTTPException as exc: detail = exc.detail or {} if isinstance(detail, dict): detail.setdefault("prediction_id", prediction_id) detail.setdefault("resume_available", True) raise outputs = result.get("outputs") or [] if not outputs: raise HTTPException(status_code=502, detail="WaveSpeed InfiniteTalk completed but returned no outputs.") video_url = outputs[0] video_response = requests.get(video_url, timeout=180) if video_response.status_code != 200: raise HTTPException( status_code=502, detail={ "error": "Failed to download InfiniteTalk video", "status_code": video_response.status_code, "response": video_response.text[:200], }, ) metadata = result.get("metadata") or {} duration = metadata.get("duration_seconds") or metadata.get("duration") or 0 logger.info( "[InfiniteTalk] Generated talking avatar video user=%s scene=%s resolution=%s size=%s bytes", user_id, scene_data.get("scene_number"), resolution, len(video_response.content), ) return { "video_bytes": video_response.content, "prompt": animation_prompt, "duration": duration or 5, "model_name": INFINITALK_MODEL_NAME, "cost": INFINITALK_DEFAULT_COST, "provider": "wavespeed", "source_video_url": video_url, "prediction_id": prediction_id, }