Base code
backend/services/wavespeed/kling_animation.py (new file, 360 lines)

@@ -0,0 +1,360 @@
from __future__ import annotations

import base64
import json
from typing import Any, Dict, Optional

import requests
from fastapi import HTTPException

from services.llm_providers.main_text_generation import llm_text_gen
from utils.logger_utils import get_service_logger

from .client import WaveSpeedClient

try:
    import imghdr
except ModuleNotFoundError:  # Python 3.13 removed imghdr
    imghdr = None

logger = get_service_logger("wavespeed.kling_animation")

KLING_MODEL_PATH = "kwaivgi/kling-v2.5-turbo-std/image-to-video"
KLING_MODEL_5S = "kling-v2.5-turbo-std-5s"
KLING_MODEL_10S = "kling-v2.5-turbo-std-10s"
MAX_IMAGE_BYTES = 10 * 1024 * 1024  # 10 MB limit per docs

def _detect_image_mime(image_bytes: bytes) -> str:
    """Best-effort MIME detection for the scene image (PNG, JPEG, or GIF)."""
    if imghdr:
        detected = imghdr.what(None, h=image_bytes)
        if detected == "jpeg":
            return "image/jpeg"
        if detected == "png":
            return "image/png"
        if detected == "gif":
            return "image/gif"

    # Fall back to magic-byte checks when imghdr is unavailable (Python 3.13+).
    header = image_bytes[:8]
    if header.startswith(b"\x89PNG"):
        return "image/png"
    if header[:2] == b"\xff\xd8":
        return "image/jpeg"
    if header[:3] == b"GIF":
        return "image/gif"

    return "image/png"

def _build_fallback_prompt(scene_data: Dict[str, Any], story_context: Dict[str, Any]) -> str:
    """Build a deterministic animation prompt from scene and story fields when the LLM is unavailable."""
    title = (scene_data.get("title") or "Scene").strip()
    description = (scene_data.get("description") or "").strip()
    image_prompt = (scene_data.get("image_prompt") or "").strip()
    tone = (story_context.get("story_tone") or "story").strip()
    setting = (story_context.get("story_setting") or "the scene").strip()

    parts = [
        f"{title} cinematic motion shot.",
        description[:220] if description else "",
        f"Camera glides with subtle parallax over {setting}.",
        f"Maintain a {tone} mood with natural lighting accents.",
        f"Honor the original illustration details: {image_prompt[:200]}." if image_prompt else "",
        "5-second sequence, gentle push-in, flowing cloth and atmospheric particles.",
    ]
    fallback_prompt = " ".join(filter(None, parts))
    return fallback_prompt.strip()

def _load_llm_json_response(response_text: Any) -> Dict[str, Any]:
    """Normalize responses from llm_text_gen (dict or JSON string)."""
    if isinstance(response_text, dict):
        return response_text
    if isinstance(response_text, str):
        return json.loads(response_text)
    raise ValueError(f"Unexpected response type: {type(response_text)}")

def _generate_text_prompt(
    *,
    prompt: str,
    system_prompt: str,
    user_id: str,
    fallback_prompt: str,
) -> str:
    """Fallback text generation when structured JSON parsing fails."""
    try:
        response = llm_text_gen(
            prompt=prompt.strip(),
            system_prompt=system_prompt,
            user_id=user_id,
        )
    except HTTPException as exc:
        if exc.status_code == 429:
            raise
        logger.warning(
            "[AnimateScene] Text-mode prompt generation failed (%s). Using deterministic fallback.",
            exc.detail,
        )
        return fallback_prompt
    except Exception as exc:
        logger.error(
            "[AnimateScene] Unexpected error generating text prompt: %s",
            exc,
            exc_info=True,
        )
        return fallback_prompt

    if isinstance(response, dict):
        candidates = [
            response.get("animation_prompt"),
            response.get("prompt"),
            response.get("text"),
        ]
        for candidate in candidates:
            if isinstance(candidate, str) and candidate.strip():
                return candidate.strip()
        # As a last resort, stringify the dict
        response_text = json.dumps(response, ensure_ascii=False)
    else:
        response_text = str(response)

    cleaned = response_text.strip()
    return cleaned or fallback_prompt

def generate_animation_prompt(
    scene_data: Dict[str, Any],
    story_context: Dict[str, Any],
    user_id: str,
) -> str:
    """
    Generate an animation-focused prompt using llm_text_gen, falling back to a
    deterministic prompt if the LLM call fails.
    """
    fallback_prompt = _build_fallback_prompt(scene_data, story_context)
    system_prompt = (
        "You are an expert cinematic animation director. "
        "You transform static illustrated scenes into short cinematic motion clips. "
        "Describe motion, camera behavior, atmosphere, and pacing."
    )

    description = scene_data.get("description", "")
    image_prompt = scene_data.get("image_prompt", "")
    title = scene_data.get("title", "")
    tone = story_context.get("story_tone", "")
    setting = story_context.get("story_setting", "")

    prompt = f"""
Create a concise animation prompt (2-3 sentences) for a 5-second cinematic clip.

Scene Title: {title}
Description: {description}
Existing Image Prompt: {image_prompt}
Story Tone: {tone}
Setting: {setting}

Focus on:
- Motion of characters/objects
- Camera movement (pan, zoom, dolly, orbit)
- Atmosphere, lighting, and emotion
- Timing cues appropriate for a {tone or "story"} scene

Respond with JSON: {{"animation_prompt": "<prompt>"}}
"""

    try:
        response = llm_text_gen(
            prompt=prompt.strip(),
            system_prompt=system_prompt,
            user_id=user_id,
            json_struct={
                "type": "object",
                "properties": {
                    "animation_prompt": {
                        "type": "string",
                        "description": "A cinematic motion prompt for the WaveSpeed image-to-video model.",
                    }
                },
                "required": ["animation_prompt"],
            },
        )
        structured = _load_llm_json_response(response)
        animation_prompt = structured.get("animation_prompt")
        if not animation_prompt or not isinstance(animation_prompt, str):
            raise ValueError("Missing animation_prompt in structured response")
        cleaned_prompt = animation_prompt.strip()
        if not cleaned_prompt:
            raise ValueError("animation_prompt is empty after trimming")
        return cleaned_prompt
    except HTTPException as exc:
        if exc.status_code == 429:
            raise
        logger.warning(
            "[AnimateScene] Structured LLM prompt generation failed (%s). Falling back to text parsing.",
            exc.detail,
        )
        return _generate_text_prompt(
            prompt=prompt,
            system_prompt=system_prompt,
            user_id=user_id,
            fallback_prompt=fallback_prompt,
        )
    except (json.JSONDecodeError, ValueError, KeyError) as exc:
        logger.warning(
            "[AnimateScene] Failed to parse structured animation prompt (%s). Falling back to text parsing.",
            exc,
        )
        return _generate_text_prompt(
            prompt=prompt,
            system_prompt=system_prompt,
            user_id=user_id,
            fallback_prompt=fallback_prompt,
        )
    except Exception as exc:
        logger.error(
            "[AnimateScene] Unexpected error generating animation prompt: %s",
            exc,
            exc_info=True,
        )
        return fallback_prompt

def animate_scene_image(
    *,
    image_bytes: bytes,
    scene_data: Dict[str, Any],
    story_context: Dict[str, Any],
    user_id: str,
    duration: int = 5,
    guidance_scale: float = 0.5,
    negative_prompt: Optional[str] = None,
    client: Optional[WaveSpeedClient] = None,
) -> Dict[str, Any]:
    """
    Animate a scene image using WaveSpeed Kling v2.5 Turbo Std.

    Returns a dict with the video bytes, prompt used, model name, duration, and cost.
    """
    if duration not in (5, 10):
        raise HTTPException(status_code=400, detail="Duration must be 5 or 10 seconds for scene animation.")

    if len(image_bytes) > MAX_IMAGE_BYTES:
        raise HTTPException(
            status_code=400,
            detail="Scene image exceeds the 10 MB limit required by WaveSpeed.",
        )

    guidance_scale = max(0.0, min(1.0, guidance_scale))
    animation_prompt = generate_animation_prompt(scene_data, story_context, user_id)
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")

    payload = {
        "duration": duration,
        "guidance_scale": guidance_scale,
        "image": image_b64,
        "prompt": animation_prompt,
    }
    if negative_prompt:
        payload["negative_prompt"] = negative_prompt.strip()

    client = client or WaveSpeedClient()
    prediction_id = client.submit_image_to_video(KLING_MODEL_PATH, payload)
    try:
        result = client.poll_until_complete(prediction_id, timeout_seconds=240, interval_seconds=1.0)
    except HTTPException as exc:
        detail = exc.detail or {}
        if isinstance(detail, dict):
            detail.setdefault("prediction_id", prediction_id)
            detail.setdefault("resume_available", True)
            detail.setdefault("message", "WaveSpeed request is still processing. Use the resume endpoint to fetch the video once ready.")
        raise HTTPException(status_code=exc.status_code, detail=detail)

    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed completed but returned no outputs.")

    video_url = outputs[0]
    video_response = requests.get(video_url, timeout=60)
    if video_response.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Failed to download animation video",
                "status_code": video_response.status_code,
                "response": video_response.text[:200],
            },
        )

    model_name = KLING_MODEL_5S if duration == 5 else KLING_MODEL_10S
    cost = 0.21 if duration == 5 else 0.42

    return {
        "video_bytes": video_response.content,
        "prompt": animation_prompt,
        "duration": duration,
        "model_name": model_name,
        "cost": cost,
        "provider": "wavespeed",
        "source_video_url": video_url,
        "prediction_id": prediction_id,
    }

def resume_scene_animation(
    *,
    prediction_id: str,
    duration: int,
    user_id: str,
    client: Optional[WaveSpeedClient] = None,
) -> Dict[str, Any]:
    """
    Resume a previously submitted animation by fetching the completed result.
    """
    if duration not in (5, 10):
        raise HTTPException(status_code=400, detail="Duration must be 5 or 10 seconds for scene animation.")

    client = client or WaveSpeedClient()
    result = client.get_prediction_result(prediction_id, timeout=120)
    status = result.get("status")
    if status != "completed":
        raise HTTPException(
            status_code=409,
            detail={
                "error": "WaveSpeed prediction is not completed yet",
                "prediction_id": prediction_id,
                "status": status,
            },
        )

    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed completed but returned no outputs.")

    video_url = outputs[0]
    video_response = requests.get(video_url, timeout=120)
    if video_response.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Failed to download animation video during resume",
                "status_code": video_response.status_code,
                "response": video_response.text[:200],
                "prediction_id": prediction_id,
            },
        )

    animation_prompt = result.get("prompt") or ""
    model_name = KLING_MODEL_5S if duration == 5 else KLING_MODEL_10S
    cost = 0.21 if duration == 5 else 0.42

    logger.info("[AnimateScene] Resumed download for prediction=%s", prediction_id)

    return {
        "video_bytes": video_response.content,
        "prompt": animation_prompt,
        "duration": duration,
        "model_name": model_name,
        "cost": cost,
        "provider": "wavespeed",
        "source_video_url": video_url,
        "prediction_id": prediction_id,
    }
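
For context, a minimal caller sketch for the two public entry points, assuming the default WaveSpeedClient picks up its credentials from the environment; the file paths, scene/story fields, and user id below are illustrative placeholders, not part of this commit:

# Hypothetical usage sketch (all values are placeholders).
from pathlib import Path

image_bytes = Path("scene_001.png").read_bytes()

result = animate_scene_image(
    image_bytes=image_bytes,
    scene_data={
        "title": "Harbor at Dawn",
        "description": "Fishing boats sway gently in the morning mist.",
        "image_prompt": "watercolor harbor, soft golden light",
    },
    story_context={"story_tone": "serene", "story_setting": "a quiet fishing village"},
    user_id="user-123",
    duration=5,
)

Path("scene_001.mp4").write_bytes(result["video_bytes"])
print(result["model_name"], result["cost"], result["prediction_id"])

# If polling times out, the raised HTTPException detail carries prediction_id and
# resume_available, so the clip can be fetched later with:
# resume_scene_animation(prediction_id="<saved id>", duration=5, user_id="user-123")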