361 lines
12 KiB
Python
361 lines
12 KiB
Python
from __future__ import annotations
|
|
|
|
import base64
|
|
import json
|
|
from typing import Any, Dict, Optional
|
|
|
|
import requests
|
|
from fastapi import HTTPException
|
|
|
|
from services.llm_providers.main_text_generation import llm_text_gen
|
|
from utils.logger_utils import get_service_logger
|
|
|
|
from .client import WaveSpeedClient
|
|
|
|
try:
|
|
import imghdr
|
|
except ModuleNotFoundError: # Python 3.13 removed imghdr
|
|
imghdr = None
|
|
|
|
logger = get_service_logger("wavespeed.kling_animation")
|
|
|
|
# WaveSpeed API route for the Kling v2.5 Turbo Std image-to-video model.
KLING_MODEL_PATH = "kwaivgi/kling-v2.5-turbo-std/image-to-video"
# Display/billing model names recorded in results, keyed by clip duration.
KLING_MODEL_5S = "kling-v2.5-turbo-std-5s"
KLING_MODEL_10S = "kling-v2.5-turbo-std-10s"
MAX_IMAGE_BYTES = 10 * 1024 * 1024  # 10 MB limit per docs
|
|
|
|
|
|
def _detect_image_mime(image_bytes: bytes) -> str:
|
|
if imghdr:
|
|
detected = imghdr.what(None, h=image_bytes)
|
|
if detected == "jpeg":
|
|
return "image/jpeg"
|
|
if detected == "png":
|
|
return "image/png"
|
|
if detected == "gif":
|
|
return "image/gif"
|
|
|
|
header = image_bytes[:8]
|
|
if header.startswith(b"\x89PNG"):
|
|
return "image/png"
|
|
if header[:2] == b"\xff\xd8":
|
|
return "image/jpeg"
|
|
if header[:3] in (b"GIF", b"GIF"):
|
|
return "image/gif"
|
|
|
|
return "image/png"
|
|
|
|
|
|
def _build_fallback_prompt(scene_data: Dict[str, Any], story_context: Dict[str, Any]) -> str:
|
|
title = (scene_data.get("title") or "Scene").strip()
|
|
description = (scene_data.get("description") or "").strip()
|
|
image_prompt = (scene_data.get("image_prompt") or "").strip()
|
|
tone = (story_context.get("story_tone") or "story").strip()
|
|
setting = (story_context.get("story_setting") or "the scene").strip()
|
|
|
|
parts = [
|
|
f"{title} cinematic motion shot.",
|
|
description[:220] if description else "",
|
|
f"Camera glides with subtle parallax over {setting}.",
|
|
f"Maintain a {tone} mood with natural lighting accents.",
|
|
f"Honor the original illustration details: {image_prompt[:200]}." if image_prompt else "",
|
|
"5-second sequence, gentle push-in, flowing cloth and atmospheric particles.",
|
|
]
|
|
fallback_prompt = " ".join(filter(None, parts))
|
|
return fallback_prompt.strip()
|
|
|
|
|
|
def _load_llm_json_response(response_text: Any) -> Dict[str, Any]:
|
|
"""Normalize responses from llm_text_gen (dict or JSON string)."""
|
|
if isinstance(response_text, dict):
|
|
return response_text
|
|
if isinstance(response_text, str):
|
|
return json.loads(response_text)
|
|
raise ValueError(f"Unexpected response type: {type(response_text)}")
|
|
|
|
|
|
def _generate_text_prompt(
    *,
    prompt: str,
    system_prompt: str,
    user_id: str,
    fallback_prompt: str,
) -> str:
    """Fallback text generation when structured JSON parsing fails."""
    try:
        response = llm_text_gen(
            prompt=prompt.strip(),
            system_prompt=system_prompt,
            user_id=user_id,
        )
    except HTTPException as exc:
        # Rate limits propagate so the caller can surface a 429.
        if exc.status_code == 429:
            raise
        logger.warning(
            "[AnimateScene] Text-mode prompt generation failed (%s). Using deterministic fallback.",
            exc.detail,
        )
        return fallback_prompt
    except Exception as exc:
        logger.error(
            "[AnimateScene] Unexpected error generating text prompt: %s",
            exc,
            exc_info=True,
        )
        return fallback_prompt

    if isinstance(response, dict):
        # Accept any of the common prompt-bearing keys, first non-blank wins.
        for key in ("animation_prompt", "prompt", "text"):
            value = response.get(key)
            if isinstance(value, str) and value.strip():
                return value.strip()
        # As a last resort, stringify the dict
        response_text = json.dumps(response, ensure_ascii=False)
    else:
        response_text = str(response)

    cleaned = response_text.strip()
    return cleaned or fallback_prompt
|
|
|
|
|
|
def generate_animation_prompt(
    scene_data: Dict[str, Any],
    story_context: Dict[str, Any],
    user_id: str,
) -> str:
    """
    Generate an animation-focused prompt using llm_text_gen, falling back to a deterministic prompt if LLM fails.

    Tries structured (JSON-schema) generation first, then plain-text
    generation via ``_generate_text_prompt``, and finally the deterministic
    fallback from ``_build_fallback_prompt``.

    Args:
        scene_data: Scene fields (``title``, ``description``, ``image_prompt``).
        story_context: Story-level fields (``story_tone``, ``story_setting``).
        user_id: Forwarded to llm_text_gen for attribution/quota.

    Returns:
        A non-empty animation prompt string.

    Raises:
        HTTPException: Re-raised only for 429 (rate limit); every other
            failure degrades to a fallback prompt instead of erroring.
    """
    fallback_prompt = _build_fallback_prompt(scene_data, story_context)
    system_prompt = (
        "You are an expert cinematic animation director. "
        "You transform static illustrated scenes into short cinematic motion clips. "
        "Describe motion, camera behavior, atmosphere, and pacing."
    )

    description = scene_data.get("description", "")
    image_prompt = scene_data.get("image_prompt", "")
    title = scene_data.get("title", "")
    # Bug fix: the original used `story_context.get(k) or story_context.get(k, "")`
    # (same key twice) — when the key exists with a None value both lookups
    # return None and the literal string "None" leaked into the template below.
    tone = story_context.get("story_tone") or ""
    setting = story_context.get("story_setting") or ""

    prompt = f"""
Create a concise animation prompt (2-3 sentences) for a 5-second cinematic clip.

Scene Title: {title}
Description: {description}
Existing Image Prompt: {image_prompt}
Story Tone: {tone}
Setting: {setting}

Focus on:
- Motion of characters/objects
- Camera movement (pan, zoom, dolly, orbit)
- Atmosphere, lighting, and emotion
- Timing cues appropriate for a {tone or "story"} scene

Respond with JSON: {{"animation_prompt": "<prompt>"}}
"""

    try:
        response = llm_text_gen(
            prompt=prompt.strip(),
            system_prompt=system_prompt,
            user_id=user_id,
            json_struct={
                "type": "object",
                "properties": {
                    "animation_prompt": {
                        "type": "string",
                        "description": "A cinematic motion prompt for the WaveSpeed image-to-video model.",
                    }
                },
                "required": ["animation_prompt"],
            },
        )
        structured = _load_llm_json_response(response)
        animation_prompt = structured.get("animation_prompt")
        if not animation_prompt or not isinstance(animation_prompt, str):
            raise ValueError("Missing animation_prompt in structured response")
        cleaned_prompt = animation_prompt.strip()
        if not cleaned_prompt:
            raise ValueError("animation_prompt is empty after trimming")
        return cleaned_prompt
    except HTTPException as exc:
        # Rate limits must propagate so the API returns 429 to the caller.
        if exc.status_code == 429:
            raise
        logger.warning(
            "[AnimateScene] Structured LLM prompt generation failed (%s). Falling back to text parsing.",
            exc.detail,
        )
        return _generate_text_prompt(
            prompt=prompt,
            system_prompt=system_prompt,
            user_id=user_id,
            fallback_prompt=fallback_prompt,
        )
    except (json.JSONDecodeError, ValueError, KeyError) as exc:
        logger.warning(
            "[AnimateScene] Failed to parse structured animation prompt (%s). Falling back to text parsing.",
            exc,
        )
        return _generate_text_prompt(
            prompt=prompt,
            system_prompt=system_prompt,
            user_id=user_id,
            fallback_prompt=fallback_prompt,
        )
    except Exception as exc:
        logger.error(
            "[AnimateScene] Unexpected error generating animation prompt: %s",
            exc,
            exc_info=True,
        )
        return fallback_prompt
|
|
|
|
|
|
def animate_scene_image(
    *,
    image_bytes: bytes,
    scene_data: Dict[str, Any],
    story_context: Dict[str, Any],
    user_id: str,
    duration: int = 5,
    guidance_scale: float = 0.5,
    negative_prompt: Optional[str] = None,
    client: Optional[WaveSpeedClient] = None,
) -> Dict[str, Any]:
    """
    Animate a scene image using WaveSpeed Kling v2.5 Turbo Std.
    Returns dict with video bytes, prompt used, model name, duration, and cost.

    Args:
        image_bytes: Raw source image (<= 10 MB per WaveSpeed docs).
        scene_data: Scene fields used to build the animation prompt.
        story_context: Story-level fields used to build the animation prompt.
        user_id: Forwarded to prompt generation for attribution.
        duration: Clip length in seconds; only 5 or 10 are accepted.
        guidance_scale: Clamped into [0.0, 1.0] before submission.
        negative_prompt: Optional negative prompt; stripped if provided.
        client: Injectable WaveSpeed client (a new one is created if None).

    Raises:
        HTTPException: 400 for invalid duration/oversized image, 502 when
            WaveSpeed returns no outputs or the video download fails;
            polling errors are re-raised with resume metadata attached.
    """
    if duration not in (5, 10):
        raise HTTPException(status_code=400, detail="Duration must be 5 or 10 seconds for scene animation.")

    if len(image_bytes) > MAX_IMAGE_BYTES:
        raise HTTPException(
            status_code=400,
            detail="Scene image exceeds 10MB limit required by WaveSpeed."
        )

    # Clamp rather than reject out-of-range guidance values.
    guidance_scale = max(0.0, min(1.0, guidance_scale))
    animation_prompt = generate_animation_prompt(scene_data, story_context, user_id)
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")

    payload = {
        "duration": duration,
        "guidance_scale": guidance_scale,
        "image": image_b64,
        "prompt": animation_prompt,
    }
    if negative_prompt:
        payload["negative_prompt"] = negative_prompt.strip()

    client = client or WaveSpeedClient()
    prediction_id = client.submit_image_to_video(KLING_MODEL_PATH, payload)
    try:
        result = client.poll_until_complete(prediction_id, timeout_seconds=240, interval_seconds=1.0)
    except HTTPException as exc:
        # Enrich the error with resume metadata so the frontend can retry
        # via resume_scene_animation without resubmitting the job.
        detail = exc.detail or {}
        if isinstance(detail, dict):
            # setdefault: preserve any fields the client already attached.
            detail.setdefault("prediction_id", prediction_id)
            detail.setdefault("resume_available", True)
            detail.setdefault("message", "WaveSpeed request is still processing. Use resume endpoint to fetch the video once ready.")
        raise HTTPException(status_code=exc.status_code, detail=detail)

    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed completed but returned no outputs.")

    # First output is the rendered video URL.
    video_url = outputs[0]
    video_response = requests.get(video_url, timeout=60)
    if video_response.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Failed to download animation video",
                "status_code": video_response.status_code,
                "response": video_response.text[:200],
            },
        )

    model_name = KLING_MODEL_5S if duration == 5 else KLING_MODEL_10S
    # Flat pricing per clip length; keep in sync with WaveSpeed billing.
    cost = 0.21 if duration == 5 else 0.42

    return {
        "video_bytes": video_response.content,
        "prompt": animation_prompt,
        "duration": duration,
        "model_name": model_name,
        "cost": cost,
        "provider": "wavespeed",
        "source_video_url": video_url,
        "prediction_id": prediction_id,
    }
|
|
|
|
|
|
def resume_scene_animation(
    *,
    prediction_id: str,
    duration: int,
    user_id: str,
    client: Optional[WaveSpeedClient] = None,
) -> Dict[str, Any]:
    """
    Resume a previously submitted animation by fetching the completed result.
    """
    if duration not in (5, 10):
        raise HTTPException(status_code=400, detail="Duration must be 5 or 10 seconds for scene animation.")

    ws_client = client or WaveSpeedClient()
    prediction = ws_client.get_prediction_result(prediction_id, timeout=120)

    current_status = prediction.get("status")
    if current_status != "completed":
        # Not done yet — 409 tells the caller to poll again later.
        raise HTTPException(
            status_code=409,
            detail={
                "error": "WaveSpeed prediction is not completed yet",
                "prediction_id": prediction_id,
                "status": current_status,
            },
        )

    output_urls = prediction.get("outputs") or []
    if not output_urls:
        raise HTTPException(status_code=502, detail="WaveSpeed completed but returned no outputs.")

    source_url = output_urls[0]
    download = requests.get(source_url, timeout=120)
    if download.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Failed to download animation video during resume",
                "status_code": download.status_code,
                "response": download.text[:200],
                "prediction_id": prediction_id,
            },
        )

    is_short_clip = duration == 5

    logger.info("[AnimateScene] Resumed download for prediction=%s", prediction_id)

    return {
        "video_bytes": download.content,
        "prompt": prediction.get("prompt") or "",
        "duration": duration,
        "model_name": KLING_MODEL_5S if is_short_clip else KLING_MODEL_10S,
        "cost": 0.21 if is_short_clip else 0.42,
        "provider": "wavespeed",
        "source_video_url": source_url,
        "prediction_id": prediction_id,
    }
|
|
|