"""WaveSpeed Kling v2.5 Turbo Std image-to-video helpers for scene animation.

The module builds an animation prompt for a scene (via ``llm_text_gen`` with a
deterministic fallback), submits the scene image to WaveSpeed, waits for the
prediction to finish and downloads the resulting video. A separate entry point
resumes a prediction that was submitted earlier.
"""

from __future__ import annotations

import base64
import json
from typing import Any, Dict, Optional

import requests
from fastapi import HTTPException

from services.llm_providers.main_text_generation import llm_text_gen
from utils.logger_utils import get_service_logger

from .client import WaveSpeedClient

try:
    import imghdr
except ModuleNotFoundError:  # Python 3.13 removed imghdr
    imghdr = None

logger = get_service_logger("wavespeed.kling_animation")

KLING_MODEL_PATH = "kwaivgi/kling-v2.5-turbo-std/image-to-video"
KLING_MODEL_5S = "kling-v2.5-turbo-std-5s"
KLING_MODEL_10S = "kling-v2.5-turbo-std-10s"
MAX_IMAGE_BYTES = 10 * 1024 * 1024  # 10 MB limit per docs

# Supported clip lengths (seconds) mapped to the reported model name and cost.
_MODEL_BY_DURATION = {5: KLING_MODEL_5S, 10: KLING_MODEL_10S}
_COST_BY_DURATION = {5: 0.21, 10: 0.42}

# imghdr type names mapped to the MIME types this module reports.
_IMGHDR_TO_MIME = {
    "jpeg": "image/jpeg",
    "png": "image/png",
    "gif": "image/gif",
}

# Magic-number prefixes used when imghdr is unavailable or inconclusive.
_MAGIC_TO_MIME = (
    (b"\x89PNG", "image/png"),
    (b"\xff\xd8", "image/jpeg"),
    (b"GIF", "image/gif"),
)

# Keys probed, in order, when looking for the prompt in an LLM response dict.
_PROMPT_KEYS = ("animation_prompt", "prompt", "text")


def _detect_image_mime(image_bytes: bytes) -> str:
    """Return the MIME type of ``image_bytes`` (PNG, JPEG or GIF).

    ``imghdr`` is consulted first when it is importable; otherwise, or when it
    reports something other than the three supported types, the leading magic
    bytes are checked. Unrecognised data is reported as ``image/png``.
    """
    if imghdr:
        mime = _IMGHDR_TO_MIME.get(imghdr.what(None, h=image_bytes))
        if mime:
            return mime

    for magic, mime in _MAGIC_TO_MIME:
        if image_bytes.startswith(magic):
            return mime
    return "image/png"


def _clean_text(value: Any, default: str = "") -> str:
    """Return ``value`` as stripped text, or ``default`` when it is missing/blank."""
    if value is None:
        return default
    return str(value).strip() or default


def _build_fallback_prompt(
    scene_data: Dict[str, Any],
    story_context: Dict[str, Any],
    duration: int = 5,
) -> str:
    """Build a deterministic animation prompt that needs no LLM call.

    Args:
        scene_data: Scene fields; ``title``, ``description`` and
            ``image_prompt`` are read when present.
        story_context: Story fields; ``story_tone`` and ``story_setting`` are
            read when present.
        duration: Clip length in seconds mentioned in the prompt.

    Returns:
        A single-line prompt assembled from the non-empty parts.
    """
    title = _clean_text(scene_data.get("title"), "Scene")
    description = _clean_text(scene_data.get("description"))
    image_prompt = _clean_text(scene_data.get("image_prompt"))
    tone = _clean_text(story_context.get("story_tone"), "story")
    setting = _clean_text(story_context.get("story_setting"), "the scene")

    parts = [
        f"{title} cinematic motion shot.",
        description[:220],
        f"Camera glides with subtle parallax over {setting}.",
        f"Maintain a {tone} mood with natural lighting accents.",
        f"Honor the original illustration details: {image_prompt[:200]}." if image_prompt else "",
        f"{duration}-second sequence, gentle push-in, flowing cloth and atmospheric particles.",
    ]
    return " ".join(part for part in parts if part).strip()


def _load_llm_json_response(response_text: Any) -> Dict[str, Any]:
    """Normalize responses from llm_text_gen (dict or JSON string).

    Raises:
        ValueError: If the response is neither a dict nor a string, if the
            string is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError``), or if the JSON is not an object.
    """
    if isinstance(response_text, dict):
        return response_text
    if isinstance(response_text, str):
        parsed = json.loads(response_text)
        # A JSON list/string/number would break the caller's ``.get`` lookups.
        if not isinstance(parsed, dict):
            raise ValueError(f"Expected a JSON object, got {type(parsed).__name__}")
        return parsed
    raise ValueError(f"Unexpected response type: {type(response_text)}")


def _first_prompt_candidate(payload: Dict[str, Any]) -> Optional[str]:
    """Return the first non-blank string found under ``_PROMPT_KEYS``, if any."""
    for key in _PROMPT_KEYS:
        candidate = payload.get(key)
        if isinstance(candidate, str) and candidate.strip():
            return candidate.strip()
    return None


def _strip_code_fence(text: str) -> str:
    """Remove a surrounding Markdown code fence (with optional language tag)."""
    stripped = text.strip()
    fenced = len(stripped) >= 6 and stripped.startswith("```") and stripped.endswith("```")
    if not fenced:
        return stripped

    inner = stripped[3:-3]
    first_line, newline, rest = inner.partition("\n")
    # Drop the opening line when it is empty or just a language tag ("json").
    if newline and (not first_line.strip() or first_line.strip().isalnum()):
        inner = rest
    return inner.strip()


def _generate_text_prompt(
    *,
    prompt: str,
    system_prompt: str,
    user_id: str,
    fallback_prompt: str,
) -> str:
    """Fallback text generation when structured JSON parsing fails.

    Calls ``llm_text_gen`` without a JSON schema and extracts the prompt from
    whatever comes back: a dict, a (possibly code-fenced) JSON string, or plain
    text.

    Returns:
        The extracted prompt, or ``fallback_prompt`` when the call fails or
        yields nothing usable.

    Raises:
        HTTPException: Re-raised unchanged when the LLM call reports 429.
    """
    try:
        response = llm_text_gen(
            prompt=prompt.strip(),
            system_prompt=system_prompt,
            user_id=user_id,
        )
    except HTTPException as exc:
        if exc.status_code == 429:
            raise
        logger.warning(
            "[AnimateScene] Text-mode prompt generation failed (%s). Using deterministic fallback.",
            exc.detail,
        )
        return fallback_prompt
    except Exception as exc:
        logger.error(
            "[AnimateScene] Unexpected error generating text prompt: %s",
            exc,
            exc_info=True,
        )
        return fallback_prompt

    if response is None:
        # str(None) would otherwise become the literal prompt "None".
        return fallback_prompt

    if isinstance(response, dict):
        candidate = _first_prompt_candidate(response)
        if candidate:
            return candidate
        # As a last resort, stringify the dict
        response_text = json.dumps(response, ensure_ascii=False)
    else:
        response_text = _strip_code_fence(str(response))
        # The prompt asks for JSON, so a text reply is often a JSON object;
        # unwrap it instead of sending raw JSON to the video model.
        try:
            parsed = json.loads(response_text)
        except ValueError:
            parsed = None
        if isinstance(parsed, dict):
            candidate = _first_prompt_candidate(parsed)
            if candidate:
                return candidate

    return response_text.strip() or fallback_prompt


def generate_animation_prompt(
    scene_data: Dict[str, Any],
    story_context: Dict[str, Any],
    user_id: str,
    duration: int = 5,
) -> str:
    """
    Generate an animation-focused prompt using llm_text_gen, falling back to a
    deterministic prompt if LLM fails.

    Args:
        scene_data: Scene fields (``title``, ``description``, ``image_prompt``).
        story_context: Story fields (``story_tone``, ``story_setting``).
        user_id: Passed through to ``llm_text_gen``.
        duration: Clip length in seconds mentioned in the prompt.

    Returns:
        The prompt from the structured LLM response; if that fails, the result
        of the text-mode retry; on unexpected errors, the deterministic
        fallback prompt.

    Raises:
        HTTPException: Re-raised unchanged when the LLM call reports 429.
    """
    fallback_prompt = _build_fallback_prompt(scene_data, story_context, duration)
    system_prompt = (
        "You are an expert cinematic animation director. "
        "You transform static illustrated scenes into short cinematic motion clips. "
        "Describe motion, camera behavior, atmosphere, and pacing."
    )

    # Missing or None fields become empty strings rather than the text "None".
    title = _clean_text(scene_data.get("title"))
    description = _clean_text(scene_data.get("description"))
    image_prompt = _clean_text(scene_data.get("image_prompt"))
    tone = _clean_text(story_context.get("story_tone"))
    setting = _clean_text(story_context.get("story_setting"))
    timing_tone = tone or "story"

    prompt = "\n".join(
        [
            f"Create a concise animation prompt (2-3 sentences) for a {duration}-second cinematic clip.",
            "",
            f"Scene Title: {title}",
            f"Description: {description}",
            f"Existing Image Prompt: {image_prompt}",
            f"Story Tone: {tone}",
            f"Setting: {setting}",
            "",
            "Focus on:",
            "- Motion of characters/objects",
            "- Camera movement (pan, zoom, dolly, orbit)",
            "- Atmosphere, lighting, and emotion",
            f"- Timing cues appropriate for a {timing_tone} scene",
            "",
            'Respond with JSON: {"animation_prompt": ""}',
        ]
    )

    try:
        response = llm_text_gen(
            prompt=prompt,
            system_prompt=system_prompt,
            user_id=user_id,
            json_struct={
                "type": "object",
                "properties": {
                    "animation_prompt": {
                        "type": "string",
                        "description": "A cinematic motion prompt for the WaveSpeed image-to-video model.",
                    }
                },
                "required": ["animation_prompt"],
            },
        )
        structured = _load_llm_json_response(response)
        animation_prompt = structured.get("animation_prompt")
        if not animation_prompt or not isinstance(animation_prompt, str):
            raise ValueError("Missing animation_prompt in structured response")
        cleaned_prompt = animation_prompt.strip()
        if not cleaned_prompt:
            raise ValueError("animation_prompt is empty after trimming")
        return cleaned_prompt
    except HTTPException as exc:
        if exc.status_code == 429:
            raise
        logger.warning(
            "[AnimateScene] Structured LLM prompt generation failed (%s). Falling back to text parsing.",
            exc.detail,
        )
    except (ValueError, KeyError) as exc:  # ValueError covers json.JSONDecodeError
        logger.warning(
            "[AnimateScene] Failed to parse structured animation prompt (%s). Falling back to text parsing.",
            exc,
        )
    except Exception as exc:
        logger.error(
            "[AnimateScene] Unexpected error generating animation prompt: %s",
            exc,
            exc_info=True,
        )
        return fallback_prompt

    # Reached only from the two "fall back to text parsing" handlers above.
    return _generate_text_prompt(
        prompt=prompt,
        system_prompt=system_prompt,
        user_id=user_id,
        fallback_prompt=fallback_prompt,
    )


def _validate_duration(duration: int) -> None:
    """Raise a 400 ``HTTPException`` unless ``duration`` is a supported length."""
    if duration not in _MODEL_BY_DURATION:
        raise HTTPException(
            status_code=400,
            detail="Duration must be 5 or 10 seconds for scene animation.",
        )


def _first_output_url(result: Dict[str, Any]) -> str:
    """Return the first entry of ``result["outputs"]`` or raise a 502."""
    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed completed but returned no outputs.")
    return outputs[0]


def _download_video(
    video_url: str,
    *,
    prediction_id: str,
    timeout: float,
    error_message: str,
) -> bytes:
    """Download the finished video and return its bytes.

    Raises:
        HTTPException: 502 when the request fails at the network level or the
            server answers with a non-200 status. The detail always carries
            ``prediction_id`` so the caller can retry through the resume flow.
    """
    try:
        video_response = requests.get(video_url, timeout=timeout)
    except requests.RequestException as exc:
        # Without this the error surfaces as an unhandled 500 and the
        # prediction id needed to resume is lost.
        raise HTTPException(
            status_code=502,
            detail={
                "error": error_message,
                "message": str(exc),
                "prediction_id": prediction_id,
                "resume_available": True,
            },
        ) from exc

    if video_response.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={
                "error": error_message,
                "status_code": video_response.status_code,
                "response": video_response.text[:200],
                "prediction_id": prediction_id,
            },
        )
    return video_response.content


def _build_animation_result(
    *,
    video_bytes: bytes,
    prompt: str,
    duration: int,
    video_url: str,
    prediction_id: str,
) -> Dict[str, Any]:
    """Assemble the result dict shared by the animate and resume entry points."""
    return {
        "video_bytes": video_bytes,
        "prompt": prompt,
        "duration": duration,
        "model_name": _MODEL_BY_DURATION[duration],
        "cost": _COST_BY_DURATION[duration],
        "provider": "wavespeed",
        "source_video_url": video_url,
        "prediction_id": prediction_id,
    }


def animate_scene_image(
    *,
    image_bytes: bytes,
    scene_data: Dict[str, Any],
    story_context: Dict[str, Any],
    user_id: str,
    duration: int = 5,
    guidance_scale: float = 0.5,
    negative_prompt: Optional[str] = None,
    client: Optional[WaveSpeedClient] = None,
) -> Dict[str, Any]:
    """
    Animate a scene image using WaveSpeed Kling v2.5 Turbo Std.
    Returns dict with video bytes, prompt used, model name, duration, and cost.

    Args:
        image_bytes: Raw image data; must be non-empty and at most
            ``MAX_IMAGE_BYTES``.
        scene_data: Scene fields used to build the animation prompt.
        story_context: Story fields used to build the animation prompt.
        user_id: Passed through to the LLM prompt generation.
        duration: Clip length in seconds; 5 or 10.
        guidance_scale: Clamped to the range [0.0, 1.0] before submission.
        negative_prompt: Optional; sent only when non-blank.
        client: WaveSpeed client to use; a new one is created when omitted.

    Raises:
        HTTPException: 400 for invalid duration or image size; 502 when no
            output is returned or the video download fails; polling errors are
            re-raised with ``prediction_id`` and resume hints added to a dict
            detail.
    """
    _validate_duration(duration)
    if not image_bytes:
        raise HTTPException(status_code=400, detail="Scene image is empty.")
    if len(image_bytes) > MAX_IMAGE_BYTES:
        raise HTTPException(
            status_code=400,
            detail="Scene image exceeds 10MB limit required by WaveSpeed.",
        )

    guidance_scale = max(0.0, min(1.0, guidance_scale))
    animation_prompt = generate_animation_prompt(scene_data, story_context, user_id, duration)

    payload = {
        "duration": duration,
        "guidance_scale": guidance_scale,
        "image": base64.b64encode(image_bytes).decode("utf-8"),
        "prompt": animation_prompt,
    }
    cleaned_negative = negative_prompt.strip() if negative_prompt else ""
    if cleaned_negative:
        payload["negative_prompt"] = cleaned_negative

    client = client or WaveSpeedClient()
    prediction_id = client.submit_image_to_video(KLING_MODEL_PATH, payload)

    try:
        result = client.poll_until_complete(prediction_id, timeout_seconds=240, interval_seconds=1.0)
    except HTTPException as exc:
        detail = exc.detail or {}
        if isinstance(detail, dict):
            # Work on a copy so the caught exception's detail is not mutated.
            detail = dict(detail)
            detail.setdefault("prediction_id", prediction_id)
            detail.setdefault("resume_available", True)
            detail.setdefault(
                "message",
                "WaveSpeed request is still processing. Use resume endpoint to fetch the video once ready.",
            )
        raise HTTPException(
            status_code=exc.status_code,
            detail=detail,
            headers=getattr(exc, "headers", None),
        ) from exc

    video_url = _first_output_url(result)
    video_bytes = _download_video(
        video_url,
        prediction_id=prediction_id,
        timeout=60,
        error_message="Failed to download animation video",
    )

    return _build_animation_result(
        video_bytes=video_bytes,
        prompt=animation_prompt,
        duration=duration,
        video_url=video_url,
        prediction_id=prediction_id,
    )


def resume_scene_animation(
    *,
    prediction_id: str,
    duration: int,
    user_id: str,
    client: Optional[WaveSpeedClient] = None,
) -> Dict[str, Any]:
    """
    Resume a previously submitted animation by fetching the completed result.

    Args:
        prediction_id: Identifier of the earlier WaveSpeed submission.
        duration: Clip length in seconds; 5 or 10. Used for the reported model
            name and cost.
        user_id: Accepted for interface compatibility; not used here.
        client: WaveSpeed client to use; a new one is created when omitted.

    Returns:
        The same result dict shape as ``animate_scene_image``; ``prompt`` is
        taken from the prediction result and is empty when absent.

    Raises:
        HTTPException: 400 for invalid duration; 409 when the prediction status
            is not ``"completed"``; 502 when no output is returned or the video
            download fails.
    """
    _validate_duration(duration)

    client = client or WaveSpeedClient()
    result = client.get_prediction_result(prediction_id, timeout=120)

    status = result.get("status")
    if status != "completed":
        raise HTTPException(
            status_code=409,
            detail={
                "error": "WaveSpeed prediction is not completed yet",
                "prediction_id": prediction_id,
                "status": status,
            },
        )

    video_url = _first_output_url(result)
    video_bytes = _download_video(
        video_url,
        prediction_id=prediction_id,
        timeout=120,
        error_message="Failed to download animation video during resume",
    )

    logger.info("[AnimateScene] Resumed download for prediction=%s", prediction_id)

    return _build_animation_result(
        video_bytes=video_bytes,
        prompt=result.get("prompt") or "",
        duration=duration,
        video_url=video_url,
        prediction_id=prediction_id,
    )