moreminimore-marketing/backend/services/wavespeed/hunyuan_avatar.py

"""
Hunyuan Avatar Service

Service for creating talking avatars using Hunyuan Avatar model.
Reference: https://wavespeed.ai/models/wavespeed-ai/hunyuan-avatar
"""

from __future__ import annotations

import base64
from typing import Any, Dict, Optional

import requests
from fastapi import HTTPException
from loguru import logger

from .client import WaveSpeedClient

HUNYUAN_AVATAR_MODEL_PATH = "wavespeed-ai/hunyuan-avatar"
HUNYUAN_AVATAR_MODEL_NAME = "wavespeed-ai/hunyuan-avatar"
MAX_IMAGE_BYTES = 10 * 1024 * 1024  # 10MB
MAX_AUDIO_BYTES = 50 * 1024 * 1024  # 50MB safety cap
MAX_DURATION_SECONDS = 120  # 2 minutes maximum
MIN_DURATION_SECONDS = 5  # Minimum billable duration


def _as_data_uri(content_bytes: bytes, mime_type: str) -> str:
    """Convert bytes to data URI."""
    encoded = base64.b64encode(content_bytes).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"


def calculate_hunyuan_avatar_cost(resolution: str, duration: float) -> float:
    """
    Calculate cost for Hunyuan Avatar video.

    Pricing:
    - 480p: $0.15 per 5 seconds
    - 720p: $0.30 per 5 seconds
    - Minimum charge: 5 seconds
    - Maximum billable: 120 seconds

    Args:
        resolution: Output resolution (480p or 720p)
        duration: Video duration in seconds

    Returns:
        Cost in USD
    """
    # Clamp duration to valid range
    actual_duration = max(MIN_DURATION_SECONDS, min(duration, MAX_DURATION_SECONDS))

    # Calculate cost per 5 seconds
    cost_per_5_seconds = 0.15 if resolution == "480p" else 0.30

    # Round up to nearest 5 seconds
    billable_5_second_blocks = (actual_duration + 4) // 5  # Ceiling division

    return cost_per_5_seconds * billable_5_second_blocks


def create_hunyuan_avatar(
    *,
    image_bytes: bytes,
    audio_bytes: bytes,
    resolution: str = "480p",
    prompt: Optional[str] = None,
    seed: Optional[int] = None,
    user_id: str = "video_studio",
    image_mime: str = "image/png",
    audio_mime: str = "audio/mpeg",
    client: Optional[WaveSpeedClient] = None,
    progress_callback: Optional[callable] = None,
) -> Dict[str, Any]:
    """
    Create talking avatar video using Hunyuan Avatar.

    Reference: https://wavespeed.ai/docs/docs-api/wavespeed-ai/hunyuan-avatar

    Args:
        image_bytes: Portrait image as bytes
        audio_bytes: Audio file as bytes
        resolution: Output resolution (480p or 720p, default: 480p)
        prompt: Optional text to guide expression or style
        seed: Optional random seed (-1 for random)
        user_id: User ID for tracking
        image_mime: MIME type of image
        audio_mime: MIME type of audio
        client: Optional WaveSpeedClient instance
        progress_callback: Optional progress callback function

    Returns:
        Dictionary with video_bytes, prompt, duration, model_name, cost, etc.
    """
    if not image_bytes:
        raise HTTPException(status_code=400, detail="Image bytes are required for Hunyuan Avatar.")
    if not audio_bytes:
        raise HTTPException(status_code=400, detail="Audio bytes are required for Hunyuan Avatar.")

    if len(image_bytes) > MAX_IMAGE_BYTES:
        raise HTTPException(
            status_code=400,
            detail=f"Image exceeds {MAX_IMAGE_BYTES / (1024 * 1024):.0f}MB limit required by Hunyuan Avatar.",
        )
    if len(audio_bytes) > MAX_AUDIO_BYTES:
        raise HTTPException(
            status_code=400,
            detail=f"Audio exceeds {MAX_AUDIO_BYTES / (1024 * 1024):.0f}MB limit allowed for Hunyuan Avatar requests.",
        )

    if resolution not in {"480p", "720p"}:
        raise HTTPException(status_code=400, detail="Resolution must be '480p' or '720p'.")

    # Build payload
    payload: Dict[str, Any] = {
        "image": _as_data_uri(image_bytes, image_mime),
        "audio": _as_data_uri(audio_bytes, audio_mime),
        "resolution": resolution,
    }

    if prompt:
        payload["prompt"] = prompt.strip()
    if seed is not None:
        payload["seed"] = seed

    client = client or WaveSpeedClient()

    # Progress callback: submission
    if progress_callback:
        progress_callback(10.0, "Submitting Hunyuan Avatar request to WaveSpeed...")

    prediction_id = client.submit_image_to_video(HUNYUAN_AVATAR_MODEL_PATH, payload, timeout=60)

    try:
        # Poll for completion
        if progress_callback:
            progress_callback(20.0, f"Polling for completion (prediction_id: {prediction_id})...")

        result = client.poll_until_complete(
            prediction_id,
            timeout_seconds=600,  # 10 minutes max
            interval_seconds=0.5,  # Poll every 0.5 seconds
            progress_callback=progress_callback,
        )
    except HTTPException as exc:
        detail = exc.detail or {}
        if isinstance(detail, dict):
            detail.setdefault("prediction_id", prediction_id)
            detail.setdefault("resume_available", True)
        raise

    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Hunyuan Avatar completed but returned no outputs",
                "prediction_id": prediction_id,
            }
        )

    video_url = outputs[0]
    if not isinstance(video_url, str) or not video_url.startswith("http"):
        raise HTTPException(
            status_code=502,
            detail={
                "error": f"Invalid video URL format: {video_url}",
                "prediction_id": prediction_id,
            }
        )

    # Progress callback: downloading video
    if progress_callback:
        progress_callback(90.0, "Downloading generated video...")

    # Download video
    try:
        video_response = requests.get(video_url, timeout=180)
        if video_response.status_code != 200:
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "Failed to download Hunyuan Avatar video",
                    "status_code": video_response.status_code,
                    "response": video_response.text[:200],
                    "prediction_id": prediction_id,
                }
            )
    except requests.exceptions.RequestException as e:
        raise HTTPException(
            status_code=502,
            detail={
                "error": f"Failed to download video: {str(e)}",
                "prediction_id": prediction_id,
            }
        )

    video_bytes = video_response.content
    if len(video_bytes) == 0:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Downloaded video is empty",
                "prediction_id": prediction_id,
            }
        )

    # Estimate duration (we don't get exact duration from API, so estimate from audio or use default)
    # For now, we'll use a default estimate - in production, you might want to analyze the audio file
    estimated_duration = 10.0  # Default estimate

    # Calculate cost
    cost = calculate_hunyuan_avatar_cost(resolution, estimated_duration)

    # Get video dimensions from resolution
    resolution_dims = {
        "480p": (854, 480),
        "720p": (1280, 720),
    }
    width, height = resolution_dims.get(resolution, (854, 480))

    # Extract metadata
    metadata = result.get("metadata", {})
    metadata.update({
        "has_nsfw_contents": result.get("has_nsfw_contents", []),
        "created_at": result.get("created_at"),
        "resolution": resolution,
        "max_duration": MAX_DURATION_SECONDS,
    })

    logger.info(
        f"[Hunyuan Avatar] ✅ Generated video: {len(video_bytes)} bytes, "
        f"resolution={resolution}, cost=${cost:.2f}"
    )

    # Progress callback: completed
    if progress_callback:
        progress_callback(100.0, "Avatar generation completed!")

    return {
        "video_bytes": video_bytes,
        "prompt": prompt or "",
        "duration": estimated_duration,
        "model_name": HUNYUAN_AVATAR_MODEL_NAME,
        "cost": cost,
        "provider": "wavespeed",
        "resolution": resolution,
        "width": width,
        "height": height,
        "metadata": metadata,
        "source_video_url": video_url,
        "prediction_id": prediction_id,
    }