Base code

This commit is contained in:
Kunthawat Greethong
2026-01-08 22:39:53 +07:00
parent 697115c61a
commit c35fa52117
2169 changed files with 626670 additions and 0 deletions

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,600 @@
"""
WaveSpeed AI API Client
Thin HTTP client for the WaveSpeed AI API.
Handles authentication, submission, and delegates to specialized generators.
"""
from __future__ import annotations
from typing import Any, Dict, Optional, Callable
from fastapi import HTTPException
from services.onboarding.api_key_manager import APIKeyManager
from utils.logger_utils import get_service_logger
from .polling import WaveSpeedPolling
from .generators.prompt import PromptGenerator
from .generators.image import ImageGenerator
from .generators.video import VideoGenerator
from .generators.speech import SpeechGenerator
logger = get_service_logger("wavespeed.client")
class WaveSpeedClient:
    """
    Thin HTTP client for the WaveSpeed AI API.

    Handles authentication and request submission, then delegates the actual
    work to specialized generators (prompt, image, video, speech) and to a
    shared polling helper for asynchronous jobs.
    """

    # Base URL for all WaveSpeed v3 API endpoints.
    BASE_URL = "https://api.wavespeed.ai/api/v3"

    def __init__(self, api_key: Optional[str] = None):
        """Initialize the client and its generators.

        Args:
            api_key: Explicit WaveSpeed API key. When omitted, the key is
                resolved through APIKeyManager under the "wavespeed" provider.

        Raises:
            RuntimeError: If no API key could be resolved from either source.
        """
        manager = APIKeyManager()
        self.api_key = api_key or manager.get_api_key("wavespeed")
        if not self.api_key:
            raise RuntimeError("WAVESPEED_API_KEY is not configured. Please add it to your environment.")
        # Shared polling utility used by all generators for async jobs.
        self.polling = WaveSpeedPolling(self.api_key, self.BASE_URL)
        # Specialized generators; each receives the key, base URL, and poller.
        self.prompt = PromptGenerator(self.api_key, self.BASE_URL, self.polling)
        self.image = ImageGenerator(self.api_key, self.BASE_URL, self.polling)
        self.video = VideoGenerator(self.api_key, self.BASE_URL, self.polling)
        self.speech = SpeechGenerator(self.api_key, self.BASE_URL, self.polling)

    def _headers(self) -> Dict[str, str]:
        """Build the standard JSON + Bearer-auth headers for API requests."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

    # Core submission methods (delegated to video generator)
    def submit_image_to_video(
        self,
        model_path: str,
        payload: Dict[str, Any],
        timeout: int = 30,
    ) -> str:
        """
        Submit an image-to-video generation request.

        Args:
            model_path: Model path on the WaveSpeed API.
            payload: Request payload forwarded verbatim to the API.
            timeout: Request timeout in seconds (default: 30).

        Returns:
            The prediction ID for polling.
        """
        return self.video.submit_image_to_video(model_path, payload, timeout)

    def submit_text_to_video(
        self,
        model_path: str,
        payload: Dict[str, Any],
        timeout: int = 60,
    ) -> str:
        """
        Submit a text-to-video generation request to WaveSpeed.

        Args:
            model_path: Model path (e.g., "alibaba/wan-2.5/text-to-video")
            payload: Request payload with prompt, resolution, duration, optional audio
            timeout: Request timeout in seconds (default: 60)

        Returns:
            Prediction ID for polling
        """
        return self.video.submit_text_to_video(model_path, payload, timeout)

    # Polling methods (delegated to polling utilities)
    def get_prediction_result(self, prediction_id: str, timeout: int = 30) -> Dict[str, Any]:
        """
        Fetch the current status/result for a prediction.

        Matches the example pattern: simple GET request, check
        status_code == 200, return data.
        """
        return self.polling.get_prediction_result(prediction_id, timeout)

    def poll_until_complete(
        self,
        prediction_id: str,
        timeout_seconds: Optional[int] = None,
        interval_seconds: float = 1.0,
        progress_callback: Optional[Callable[[float, str], None]] = None,
    ) -> Dict[str, Any]:
        """
        Poll WaveSpeed until the job completes or fails.

        Matches the example pattern: simple polling loop until status is
        "completed" or "failed".

        Args:
            prediction_id: The prediction ID to poll for
            timeout_seconds: Optional timeout in seconds. If None, polls indefinitely until completion/failure.
            interval_seconds: Seconds to wait between polling attempts (default: 1.0)
            progress_callback: Optional callback function(progress: float, message: str) for progress updates

        Returns:
            Dict containing the completed result

        Raises:
            HTTPException: If the task fails, polling fails, or times out (if timeout_seconds is set)
        """
        return self.polling.poll_until_complete(
            prediction_id,
            timeout_seconds=timeout_seconds,
            interval_seconds=interval_seconds,
            progress_callback=progress_callback,
        )

    # Generator methods (delegated to specialized generators)
    def optimize_prompt(
        self,
        text: str,
        mode: str = "image",
        style: str = "default",
        image: Optional[str] = None,
        enable_sync_mode: bool = True,
        timeout: int = 30,
    ) -> str:
        """
        Optimize a prompt using WaveSpeed prompt optimizer.

        Args:
            text: The prompt text to optimize
            mode: "image" or "video" (default: "image")
            style: "default", "artistic", "photographic", "technical", "anime", "realistic" (default: "default")
            image: Base64-encoded image for context (optional)
            enable_sync_mode: If True, wait for result and return it directly (default: True)
            timeout: Request timeout in seconds (default: 30)

        Returns:
            Optimized prompt text
        """
        return self.prompt.optimize_prompt(
            text=text,
            mode=mode,
            style=style,
            image=image,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
        )

    def generate_image(
        self,
        model: str,
        prompt: str,
        width: int = 1024,
        height: int = 1024,
        num_inference_steps: Optional[int] = None,
        guidance_scale: Optional[float] = None,
        negative_prompt: Optional[str] = None,
        seed: Optional[int] = None,
        enable_sync_mode: bool = True,
        timeout: int = 120,
        **kwargs
    ) -> bytes:
        """
        Generate image using WaveSpeed AI models (Ideogram V3 or Qwen Image).

        Args:
            model: Model to use ("ideogram-v3-turbo" or "qwen-image")
            prompt: Text prompt for image generation
            width: Image width (default: 1024)
            height: Image height (default: 1024)
            num_inference_steps: Number of inference steps
            guidance_scale: Guidance scale for generation
            negative_prompt: Negative prompt (what to avoid)
            seed: Random seed for reproducibility
            enable_sync_mode: If True, wait for result and return it directly (default: True)
            timeout: Request timeout in seconds (default: 120)
            **kwargs: Additional parameters forwarded to the API payload

        Returns:
            bytes: Generated image bytes
        """
        return self.image.generate_image(
            model=model,
            prompt=prompt,
            width=width,
            height=height,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            negative_prompt=negative_prompt,
            seed=seed,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
            **kwargs
        )

    def generate_character_image(
        self,
        prompt: str,
        reference_image_bytes: bytes,
        style: str = "Auto",
        aspect_ratio: str = "16:9",
        rendering_speed: str = "Default",
        timeout: Optional[int] = None,
    ) -> bytes:
        """
        Generate image using Ideogram Character API to maintain character consistency.

        Creates variations of a reference character image while respecting the
        base appearance.

        Note: This API is always async and requires polling for results.

        Args:
            prompt: Text prompt describing the scene/context for the character
            reference_image_bytes: Reference image bytes (base avatar)
            style: Character style type ("Auto", "Fiction", or "Realistic")
            aspect_ratio: Aspect ratio ("1:1", "16:9", "9:16", "4:3", "3:4")
            rendering_speed: Rendering speed ("Default", "Turbo", "Quality")
            timeout: Total timeout in seconds for submission + polling
                (None uses the image generator's default behavior)

        Returns:
            bytes: Generated image bytes with consistent character
        """
        return self.image.generate_character_image(
            prompt=prompt,
            reference_image_bytes=reference_image_bytes,
            style=style,
            aspect_ratio=aspect_ratio,
            rendering_speed=rendering_speed,
            timeout=timeout,
        )

    def generate_speech(
        self,
        text: str,
        voice_id: str,
        speed: float = 1.0,
        volume: float = 1.0,
        pitch: float = 0.0,
        emotion: str = "happy",
        enable_sync_mode: bool = True,
        timeout: int = 120,
        **kwargs
    ) -> bytes:
        """
        Generate speech audio using Minimax Speech 02 HD via WaveSpeed.

        Args:
            text: Text to convert to speech (max 10000 characters)
            voice_id: Voice ID (e.g., "Wise_Woman", "Friendly_Person", etc.)
            speed: Speech speed (0.5-2.0, default: 1.0)
            volume: Speech volume (0.1-10.0, default: 1.0)
            pitch: Speech pitch (-12 to 12, default: 0.0)
            emotion: Emotion ("happy", "sad", "angry", etc., default: "happy")
            enable_sync_mode: If True, wait for result and return it directly (default: True)
            timeout: Request timeout in seconds (default: 120)
            **kwargs: Additional parameters (sample_rate, bitrate, format, etc.)

        Returns:
            bytes: Generated audio bytes
        """
        return self.speech.generate_speech(
            text=text,
            voice_id=voice_id,
            speed=speed,
            volume=volume,
            pitch=pitch,
            emotion=emotion,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
            **kwargs
        )

    def generate_text_video(
        self,
        prompt: str,
        resolution: str = "720p",  # 480p, 720p, 1080p
        duration: int = 5,  # 5 or 10 seconds
        audio_base64: Optional[str] = None,  # Optional audio for lip-sync
        negative_prompt: Optional[str] = None,
        seed: Optional[int] = None,
        enable_prompt_expansion: bool = True,
        enable_sync_mode: bool = False,
        timeout: int = 180,
    ) -> Dict[str, Any]:
        """
        Generate video from text prompt using WAN 2.5 text-to-video.

        Args:
            prompt: Text prompt describing the video
            resolution: Output resolution (480p, 720p, 1080p)
            duration: Video duration in seconds (5 or 10)
            audio_base64: Optional audio file (wav/mp3, 3-30s, ≤15MB) for lip-sync
            negative_prompt: Optional negative prompt
            seed: Optional random seed for reproducibility
            enable_prompt_expansion: Enable prompt optimizer
            enable_sync_mode: If True, wait for result and return it directly
            timeout: Request timeout in seconds (default: 180)

        Returns:
            Dictionary with video bytes, metadata, and cost
        """
        return self.video.generate_text_video(
            prompt=prompt,
            resolution=resolution,
            duration=duration,
            audio_base64=audio_base64,
            negative_prompt=negative_prompt,
            seed=seed,
            enable_prompt_expansion=enable_prompt_expansion,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
        )

    def upscale_video(
        self,
        video: str,
        target_resolution: str = "1080p",
        enable_sync_mode: bool = False,
        timeout: int = 300,
        progress_callback: Optional[Callable[[float, str], None]] = None,
    ) -> bytes:
        """
        Upscale video using FlashVSR.

        Args:
            video: Base64-encoded video data URI or public URL
            target_resolution: Target resolution ("720p", "1080p", "2k", "4k")
            enable_sync_mode: If True, wait for result and return it directly
            timeout: Request timeout in seconds (default: 300 for long videos)
            progress_callback: Optional callback function(progress: float, message: str) for progress updates

        Returns:
            bytes: Upscaled video bytes
        """
        return self.video.upscale_video(
            video=video,
            target_resolution=target_resolution,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
            progress_callback=progress_callback,
        )

    def extend_video(
        self,
        video: str,
        prompt: str,
        model: str = "wan-2.5",
        audio: Optional[str] = None,
        negative_prompt: Optional[str] = None,
        resolution: str = "720p",
        duration: int = 5,
        enable_prompt_expansion: bool = False,
        generate_audio: bool = True,
        camera_fixed: bool = False,
        seed: Optional[int] = None,
        enable_sync_mode: bool = False,
        timeout: int = 300,
        progress_callback: Optional[Callable[[float, str], None]] = None,
    ) -> bytes:
        """
        Extend video duration using WAN 2.5, WAN 2.2 Spicy, or Seedance 1.5 Pro video-extend.

        Args:
            video: Base64-encoded video data URI or public URL
            prompt: Text prompt describing how to extend the video
            model: Model to use ("wan-2.5", "wan-2.2-spicy", or "seedance-1.5-pro")
            audio: Optional audio URL to guide generation (WAN 2.5 only)
            negative_prompt: Optional negative prompt (WAN 2.5 only)
            resolution: Output resolution (varies by model)
            duration: Duration of extended video in seconds (varies by model)
            enable_prompt_expansion: Enable prompt optimizer (WAN 2.5 only)
            generate_audio: Generate audio for extended video (Seedance 1.5 Pro only)
            camera_fixed: Fix camera position (Seedance 1.5 Pro only)
            seed: Random seed for reproducibility (-1 for random)
            enable_sync_mode: If True, wait for result and return it directly
            timeout: Request timeout in seconds (default: 300)
            progress_callback: Optional callback function(progress: float, message: str) for progress updates

        Returns:
            bytes: Extended video bytes
        """
        return self.video.extend_video(
            video=video,
            prompt=prompt,
            model=model,
            audio=audio,
            negative_prompt=negative_prompt,
            resolution=resolution,
            duration=duration,
            enable_prompt_expansion=enable_prompt_expansion,
            generate_audio=generate_audio,
            camera_fixed=camera_fixed,
            seed=seed,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
            progress_callback=progress_callback,
        )

    def face_swap(
        self,
        image: str,
        video: str,
        prompt: Optional[str] = None,
        resolution: str = "480p",
        seed: Optional[int] = None,
        enable_sync_mode: bool = False,
        timeout: int = 300,
        progress_callback: Optional[Callable[[float, str], None]] = None,
    ) -> bytes:
        """
        Perform face/character swap using MoCha (wavespeed-ai/wan-2.1/mocha).

        Args:
            image: Base64-encoded image data URI or public URL (reference character)
            video: Base64-encoded video data URI or public URL (source video)
            prompt: Optional prompt to guide the swap
            resolution: Output resolution ("480p" or "720p")
            seed: Random seed for reproducibility (-1 for random)
            enable_sync_mode: If True, wait for result and return it directly
            timeout: Request timeout in seconds (default: 300)
            progress_callback: Optional callback function(progress: float, message: str) for progress updates

        Returns:
            bytes: Face-swapped video bytes
        """
        return self.video.face_swap(
            image=image,
            video=video,
            prompt=prompt,
            resolution=resolution,
            seed=seed,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
            progress_callback=progress_callback,
        )

    def video_face_swap(
        self,
        video: str,
        face_image: str,
        target_gender: str = "all",
        target_index: int = 0,
        enable_sync_mode: bool = False,
        timeout: int = 300,
        progress_callback: Optional[Callable[[float, str], None]] = None,
    ) -> bytes:
        """
        Perform face swap using Video Face Swap (wavespeed-ai/video-face-swap).

        Args:
            video: Base64-encoded video data URI or public URL (source video)
            face_image: Base64-encoded image data URI or public URL (reference face)
            target_gender: Filter which faces to swap ("all", "female", "male")
            target_index: Select which face to swap (0 = largest, 1 = second largest, etc.)
            enable_sync_mode: If True, wait for result and return it directly
            timeout: Request timeout in seconds (default: 300)
            progress_callback: Optional callback function(progress: float, message: str) for progress updates

        Returns:
            bytes: Face-swapped video bytes
        """
        return self.video.video_face_swap(
            video=video,
            face_image=face_image,
            target_gender=target_gender,
            target_index=target_index,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
            progress_callback=progress_callback,
        )

    def video_translate(
        self,
        video: str,
        output_language: str = "English",
        enable_sync_mode: bool = False,
        timeout: int = 600,
        progress_callback: Optional[Callable[[float, str], None]] = None,
    ) -> bytes:
        """
        Translate video to target language using HeyGen Video Translate.

        Args:
            video: Base64-encoded video data URI or public URL (source video)
            output_language: Target language for translation (default: "English")
            enable_sync_mode: If True, wait for result and return it directly
            timeout: Request timeout in seconds (default: 600)
            progress_callback: Optional callback function(progress: float, message: str) for progress updates

        Returns:
            bytes: Translated video bytes
        """
        return self.video.video_translate(
            video=video,
            output_language=output_language,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
            progress_callback=progress_callback,
        )

    def remove_background(
        self,
        video: str,
        background_image: Optional[str] = None,
        enable_sync_mode: bool = False,
        timeout: int = 300,
        progress_callback: Optional[Callable[[float, str], None]] = None,
    ) -> bytes:
        """
        Remove or replace video background using Video Background Remover.

        Args:
            video: Base64-encoded video data URI or public URL (source video)
            background_image: Optional base64-encoded image data URI or public URL (replacement background)
            enable_sync_mode: If True, wait for result and return it directly
            timeout: Request timeout in seconds (default: 300)
            progress_callback: Optional callback function(progress: float, message: str) for progress updates

        Returns:
            bytes: Video with background removed/replaced
        """
        return self.video.remove_background(
            video=video,
            background_image=background_image,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
            progress_callback=progress_callback,
        )

    def hunyuan_video_foley(
        self,
        video: str,
        prompt: Optional[str] = None,
        seed: int = -1,
        enable_sync_mode: bool = False,
        timeout: int = 300,
        progress_callback: Optional[Callable[[float, str], None]] = None,
    ) -> bytes:
        """
        Generate realistic Foley and ambient audio from video using Hunyuan Video Foley.

        Args:
            video: Base64-encoded video data URI or public URL (source video)
            prompt: Optional text prompt describing desired sounds (e.g., "ocean waves, seagulls")
            seed: Random seed for reproducibility (-1 for random)
            enable_sync_mode: If True, wait for result and return it directly
            timeout: Request timeout in seconds (default: 300)
            progress_callback: Optional callback function(progress: float, message: str) for progress updates

        Returns:
            bytes: Video with generated audio
        """
        return self.video.hunyuan_video_foley(
            video=video,
            prompt=prompt,
            seed=seed,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
            progress_callback=progress_callback,
        )

    def think_sound(
        self,
        video: str,
        prompt: Optional[str] = None,
        seed: int = -1,
        enable_sync_mode: bool = False,
        timeout: int = 300,
        progress_callback: Optional[Callable[[float, str], None]] = None,
    ) -> bytes:
        """
        Generate realistic sound effects and audio tracks from video using Think Sound.

        Args:
            video: Base64-encoded video data URI or public URL (source video)
            prompt: Optional text prompt describing desired sounds (e.g., "engine roaring, footsteps on gravel")
            seed: Random seed for reproducibility (-1 for random)
            enable_sync_mode: If True, wait for result and return it directly
            timeout: Request timeout in seconds (default: 300)
            progress_callback: Optional callback function(progress: float, message: str) for progress updates

        Returns:
            bytes: Video with generated audio
        """
        return self.video.think_sound(
            video=video,
            prompt=prompt,
            seed=seed,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
            progress_callback=progress_callback,
        )

View File

@@ -0,0 +1 @@
"""WaveSpeed API generators for different content types."""

View File

@@ -0,0 +1,374 @@
"""
Image generation generator for WaveSpeed API.
"""
import time
import requests
from typing import Optional
from requests import exceptions as requests_exceptions
from fastapi import HTTPException
from utils.logger_utils import get_service_logger
logger = get_service_logger("wavespeed.generators.image")
class ImageGenerator:
    """Image generation generator for the WaveSpeed API.

    Wraps the text-to-image endpoints (Ideogram V3 Turbo, Qwen Image) and the
    always-async Ideogram Character endpoint, downloading the resulting image
    bytes from the URL the API returns.
    """

    # Default total polling timeout (seconds) for the Ideogram Character API,
    # applied when the caller does not supply one. Matches the documented
    # default in the public docstring.
    DEFAULT_CHARACTER_TIMEOUT = 180

    def __init__(self, api_key: str, base_url: str, polling):
        """Initialize image generator.

        Args:
            api_key: WaveSpeed API key
            base_url: WaveSpeed API base URL
            polling: WaveSpeedPolling instance for async operations
        """
        self.api_key = api_key
        self.base_url = base_url
        self.polling = polling

    def _get_headers(self) -> dict:
        """Get HTTP headers for API requests."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

    def generate_image(
        self,
        model: str,
        prompt: str,
        width: int = 1024,
        height: int = 1024,
        num_inference_steps: Optional[int] = None,
        guidance_scale: Optional[float] = None,
        negative_prompt: Optional[str] = None,
        seed: Optional[int] = None,
        enable_sync_mode: bool = True,
        timeout: int = 120,
        **kwargs
    ) -> bytes:
        """
        Generate image using WaveSpeed AI models (Ideogram V3 or Qwen Image).

        Args:
            model: Model to use ("ideogram-v3-turbo" or "qwen-image")
            prompt: Text prompt for image generation
            width: Image width (default: 1024)
            height: Image height (default: 1024)
            num_inference_steps: Number of inference steps
            guidance_scale: Guidance scale for generation
            negative_prompt: Negative prompt (what to avoid)
            seed: Random seed for reproducibility
            enable_sync_mode: If True, wait for result and return it directly (default: True)
            timeout: Request timeout in seconds (default: 120)
            **kwargs: Additional parameters merged into the payload (existing
                keys are never overwritten)

        Returns:
            bytes: Generated image bytes

        Raises:
            ValueError: If ``model`` is not a supported model name.
            HTTPException: On API errors, missing prediction id, or missing outputs.
        """
        # Map friendly model names to WaveSpeed API paths.
        model_paths = {
            "ideogram-v3-turbo": "ideogram-ai/ideogram-v3-turbo",
            "qwen-image": "wavespeed-ai/qwen-image/text-to-image",
        }
        model_path = model_paths.get(model)
        if not model_path:
            raise ValueError(f"Unsupported image model: {model}. Supported: {list(model_paths.keys())}")
        url = f"{self.base_url}/{model_path}"
        payload = {
            "prompt": prompt,
            "width": width,
            "height": height,
            "enable_sync_mode": enable_sync_mode,
        }
        # Add optional parameters only when provided (0/None semantics differ:
        # num_inference_steps/guidance_scale/seed are checked against None so
        # that 0 is a valid value; negative_prompt is skipped when empty).
        if num_inference_steps is not None:
            payload["num_inference_steps"] = num_inference_steps
        if guidance_scale is not None:
            payload["guidance_scale"] = guidance_scale
        if negative_prompt:
            payload["negative_prompt"] = negative_prompt
        if seed is not None:
            payload["seed"] = seed
        # Merge extra parameters without clobbering the explicit ones above.
        for key, value in kwargs.items():
            if key not in payload:
                payload[key] = value
        logger.info(f"[WaveSpeed] Generating image via {url} (model={model}, prompt_length={len(prompt)})")
        response = requests.post(url, headers=self._get_headers(), json=payload, timeout=timeout)
        if response.status_code != 200:
            logger.error(f"[WaveSpeed] Image generation failed: {response.status_code} {response.text}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "WaveSpeed image generation failed",
                    "status_code": response.status_code,
                    "response": response.text,
                },
            )
        response_json = response.json()
        # Some responses wrap the result in a "data" envelope; unwrap if present.
        data = response_json.get("data") or response_json
        # Check status - if "created" or "processing", we need to poll even in sync mode.
        status = data.get("status", "").lower()
        outputs = data.get("outputs") or []
        prediction_id = data.get("id")
        # Handle sync mode - result should be directly in outputs.
        if enable_sync_mode:
            # If we have outputs and status is "completed", use them directly.
            if outputs and status == "completed":
                logger.info(f"[WaveSpeed] Got immediate results from sync mode (status: {status})")
                image_url = self._extract_image_url(outputs)
                return self._download_image(image_url, timeout)
            # Sync mode returned "created"/"processing" - need the id to poll.
            if not prediction_id:
                logger.error(f"[WaveSpeed] Sync mode returned status '{status}' but no prediction ID: {response.text}")
                raise HTTPException(
                    status_code=502,
                    detail="WaveSpeed sync mode returned async response without prediction ID",
                )
            logger.info(
                f"[WaveSpeed] Sync mode returned status '{status}' with no outputs. "
                f"Falling back to polling (prediction_id: {prediction_id})"
            )
        # Async mode OR sync mode that returned "created"/"processing" - poll for result.
        if not prediction_id:
            logger.error(f"[WaveSpeed] No prediction ID in response: {response.text}")
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed response missing prediction id",
            )
        # Poll for result (use longer timeout for image generation).
        logger.info(f"[WaveSpeed] Polling for image generation result (prediction_id: {prediction_id}, status: {status})")
        result = self.polling.poll_until_complete(prediction_id, timeout_seconds=240, interval_seconds=1.0)
        outputs = result.get("outputs") or []
        if not outputs:
            raise HTTPException(status_code=502, detail="WaveSpeed image generator returned no outputs")
        image_url = self._extract_image_url(outputs)
        return self._download_image(image_url, timeout=60)

    def generate_character_image(
        self,
        prompt: str,
        reference_image_bytes: bytes,
        style: str = "Auto",
        aspect_ratio: str = "16:9",
        rendering_speed: str = "Default",
        timeout: Optional[int] = None,
    ) -> bytes:
        """
        Generate image using Ideogram Character API to maintain character consistency.

        Creates variations of a reference character image while respecting the
        base appearance.

        Note: This API is always async and requires polling for results.

        Args:
            prompt: Text prompt describing the scene/context for the character
            reference_image_bytes: Reference image bytes (base avatar)
            style: Character style type ("Auto", "Fiction", or "Realistic")
            aspect_ratio: Aspect ratio ("1:1", "16:9", "9:16", "4:3", "3:4")
            rendering_speed: Rendering speed ("Default", "Turbo", "Quality")
            timeout: Total timeout in seconds for submission + polling (default: 180)

        Returns:
            bytes: Generated image bytes with consistent character

        Raises:
            HTTPException: On connection failures, API errors, task failure,
                or unrecognized output format.
        """
        import base64
        # Encode reference image to base64 and wrap in a data URI (the API
        # accepts data URIs as well as public URLs).
        image_base64 = base64.b64encode(reference_image_bytes).decode('utf-8')
        image_data_uri = f"data:image/png;base64,{image_base64}"
        url = f"{self.base_url}/ideogram-ai/ideogram-character"
        payload = {
            "prompt": prompt,
            "image": image_data_uri,
            "style": style,
            "aspect_ratio": aspect_ratio,
            "rendering_speed": rendering_speed,
        }
        logger.info(f"[WaveSpeed] Generating character image via Ideogram Character (prompt_length={len(prompt)})")
        # Retry the submission on transient connection failures with
        # exponential backoff. Read/other timeouts are not retried.
        max_retries = 2
        retry_delay = 2.0
        for attempt in range(max_retries + 1):
            try:
                # (connect, read) timeouts for the submission only; the
                # caller-supplied `timeout` governs the polling phase below.
                response = requests.post(
                    url,
                    headers=self._get_headers(),
                    json=payload,
                    timeout=(30, 30)
                )
                break
            except (requests_exceptions.ConnectTimeout, requests_exceptions.ConnectionError) as e:
                if attempt < max_retries:
                    logger.warning(f"[WaveSpeed] Connection attempt {attempt + 1}/{max_retries + 1} failed, retrying in {retry_delay}s: {e}")
                    time.sleep(retry_delay)
                    retry_delay *= 2
                    continue
                else:
                    error_type = "Connection timeout" if isinstance(e, requests_exceptions.ConnectTimeout) else "Connection error"
                    logger.error(f"[WaveSpeed] {error_type} to Ideogram Character API after {max_retries + 1} attempts: {e}")
                    raise HTTPException(
                        status_code=504 if isinstance(e, requests_exceptions.ConnectTimeout) else 502,
                        detail={
                            "error": f"{error_type} to WaveSpeed Ideogram Character API",
                            "message": "Unable to establish connection to the image generation service after multiple attempts. Please check your network connection and try again.",
                            "exception": str(e),
                            "retry_recommended": True,
                        },
                    )
            except requests_exceptions.Timeout as e:
                # Read timeouts (ConnectTimeout is handled above) are not retried.
                logger.error(f"[WaveSpeed] Request timeout to Ideogram Character API: {e}")
                raise HTTPException(
                    status_code=504,
                    detail={
                        "error": "Request timeout to WaveSpeed Ideogram Character API",
                        "message": "The image generation request took too long. Please try again.",
                        "exception": str(e),
                    },
                )
        if response.status_code != 200:
            logger.error(f"[WaveSpeed] Character image generation failed: {response.status_code} {response.text}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "WaveSpeed Ideogram Character generation failed",
                    "status_code": response.status_code,
                    "response": response.text,
                },
            )
        response_json = response.json()
        data = response_json.get("data") or response_json
        # Extract prediction ID - required since this API is always async.
        prediction_id = data.get("id")
        if not prediction_id:
            logger.error(f"[WaveSpeed] No prediction ID in response: {response.text}")
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed Ideogram Character response missing prediction id",
            )
        outputs = data.get("outputs") or []
        status = data.get("status", "unknown")
        logger.info(f"[WaveSpeed] Ideogram Character task created: prediction_id={prediction_id}, status={status}")
        # If status is already completed, use outputs directly (unlikely but possible).
        if outputs and status == "completed":
            logger.info(f"[WaveSpeed] Got immediate results from Ideogram Character")
        else:
            # Always need to poll for results (API is async).
            logger.info(f"[WaveSpeed] Polling for Ideogram Character result (status: {status}, prediction_id: {prediction_id})")
            # BUGFIX: previously `timeout if timeout else None` polled
            # indefinitely when timeout was None (and treated 0 as None),
            # contradicting the documented 180s default.
            polling_timeout = timeout if timeout is not None else self.DEFAULT_CHARACTER_TIMEOUT
            result = self.polling.poll_until_complete(
                prediction_id,
                timeout_seconds=polling_timeout,
                interval_seconds=0.5,
            )
            if not isinstance(result, dict):
                logger.error(f"[WaveSpeed] Unexpected result type: {type(result)}, value: {result}")
                raise HTTPException(
                    status_code=502,
                    detail="WaveSpeed Ideogram Character returned unexpected response format",
                )
            outputs = result.get("outputs") or []
            status = result.get("status", "unknown")
            if status != "completed":
                error_msg = "Unknown error"
                if isinstance(result, dict):
                    error_msg = result.get("error") or result.get("message") or str(result.get("details", "Unknown error"))
                else:
                    error_msg = str(result)
                logger.error(f"[WaveSpeed] Ideogram Character task did not complete: status={status}, error={error_msg}")
                raise HTTPException(
                    status_code=502,
                    detail={
                        "error": "WaveSpeed Ideogram Character task failed",
                        "status": status,
                        "message": error_msg,
                    }
                )
        # Extract image URL from outputs.
        if not outputs:
            logger.error(f"[WaveSpeed] No outputs after polling: status={status}")
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed Ideogram Character returned no outputs",
            )
        image_url = self._extract_image_url(outputs)
        return self._download_image(image_url, timeout=60)

    def _extract_image_url(self, outputs: list) -> str:
        """Extract an http(s) image URL from the first output entry.

        Accepts either a bare URL string or a dict with a "url"/"image_url"/
        "output" key. Raises HTTPException(502) on any unrecognized shape.
        """
        if not isinstance(outputs, list) or len(outputs) == 0:
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed image generator output format not recognized",
            )
        first_output = outputs[0]
        if isinstance(first_output, str):
            image_url = first_output
        elif isinstance(first_output, dict):
            image_url = first_output.get("url") or first_output.get("image_url") or first_output.get("output")
        else:
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed image generator output format not recognized",
            )
        if not image_url or not (image_url.startswith("http://") or image_url.startswith("https://")):
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed image generator output format not recognized",
            )
        return image_url

    def _download_image(self, image_url: str, timeout: int = 60) -> bytes:
        """Download the generated image from the result URL.

        Raises HTTPException(502) if the URL does not return HTTP 200.
        """
        logger.info(f"[WaveSpeed] Fetching image from URL: {image_url}")
        image_response = requests.get(image_url, timeout=timeout)
        if image_response.status_code == 200:
            image_bytes = image_response.content
            logger.info(f"[WaveSpeed] Image generated successfully (size: {len(image_bytes)} bytes)")
            return image_bytes
        else:
            logger.error(f"[WaveSpeed] Failed to fetch image from URL: {image_response.status_code}")
            raise HTTPException(
                status_code=502,
                detail="Failed to fetch generated image from WaveSpeed URL",
            )

View File

@@ -0,0 +1,164 @@
"""
Prompt optimization generator for WaveSpeed API.
"""
import requests
from typing import Optional
from fastapi import HTTPException
from utils.logger_utils import get_service_logger
logger = get_service_logger("wavespeed.generators.prompt")
class PromptGenerator:
    """Prompt optimization generator for the WaveSpeed API.

    Wraps the wavespeed-ai/prompt-optimizer endpoint. Sync mode returns the
    optimized text directly when available; otherwise the job is polled to
    completion via the shared polling helper.
    """

    def __init__(self, api_key: str, base_url: str, polling):
        """Initialize prompt generator.

        Args:
            api_key: WaveSpeed API key
            base_url: WaveSpeed API base URL
            polling: WaveSpeedPolling instance for async operations
        """
        self.api_key = api_key
        self.base_url = base_url
        self.polling = polling

    def _get_headers(self) -> dict:
        """Get HTTP headers for API requests."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

    def optimize_prompt(
        self,
        text: str,
        mode: str = "image",
        style: str = "default",
        image: Optional[str] = None,
        enable_sync_mode: bool = True,
        timeout: int = 30,
    ) -> str:
        """
        Optimize a prompt using WaveSpeed prompt optimizer.

        Args:
            text: The prompt text to optimize
            mode: "image" or "video" (default: "image")
            style: "default", "artistic", "photographic", "technical", "anime", "realistic" (default: "default")
            image: Base64-encoded image for context (optional)
            enable_sync_mode: If True, wait for result and return it directly (default: True)
            timeout: Request timeout in seconds (default: 30)

        Returns:
            Optimized prompt text

        Raises:
            HTTPException: On API errors, missing prediction id, missing
                outputs, or unrecognized output format.
        """
        model_path = "wavespeed-ai/prompt-optimizer"
        url = f"{self.base_url}/{model_path}"
        payload = {
            "text": text,
            "mode": mode,
            "style": style,
            "enable_sync_mode": enable_sync_mode,
        }
        if image:
            payload["image"] = image
        logger.info(f"[WaveSpeed] Optimizing prompt via {url} (mode={mode}, style={style})")
        response = requests.post(url, headers=self._get_headers(), json=payload, timeout=timeout)
        if response.status_code != 200:
            logger.error(f"[WaveSpeed] Prompt optimization failed: {response.status_code} {response.text}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "WaveSpeed prompt optimization failed",
                    "status_code": response.status_code,
                    "response": response.text,
                },
            )
        response_json = response.json()
        # Some responses wrap the result in a "data" envelope; unwrap if present.
        data = response_json.get("data") or response_json
        outputs = data.get("outputs") or []
        prediction_id = data.get("id")
        # Sync mode with outputs present - return the optimized text directly.
        if enable_sync_mode and outputs:
            return self._finalize_outputs(outputs, timeout)
        if enable_sync_mode and not prediction_id:
            # Sync mode gave neither outputs nor an id to poll - hard failure.
            logger.error(f"[WaveSpeed] No outputs in sync mode response: {response.text}")
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed prompt optimizer returned no outputs",
            )
        if enable_sync_mode:
            # CONSISTENCY: mirror ImageGenerator - sync mode may still return a
            # "created"/"processing" response; fall back to polling instead of
            # failing outright.
            logger.info(f"[WaveSpeed] Sync mode returned no outputs. Falling back to polling (prediction_id: {prediction_id})")
        if not prediction_id:
            logger.error(f"[WaveSpeed] No prediction ID in async response: {response.text}")
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed response missing prediction id for async mode",
            )
        # Poll for result (prompt optimization is fast, so a short interval).
        result = self.polling.poll_until_complete(prediction_id, timeout_seconds=60, interval_seconds=0.5)
        outputs = result.get("outputs") or []
        if not outputs:
            raise HTTPException(status_code=502, detail="WaveSpeed prompt optimizer returned no outputs")
        return self._finalize_outputs(outputs, timeout)

    def _finalize_outputs(self, outputs: list, timeout: int) -> str:
        """Extract, validate, and log the optimized prompt from raw outputs."""
        optimized_prompt = self._extract_prompt_from_outputs(outputs, timeout)
        if not optimized_prompt:
            logger.error(f"[WaveSpeed] Could not extract optimized prompt from outputs: {outputs}")
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed prompt optimizer output format not recognized",
            )
        logger.info(f"[WaveSpeed] Prompt optimized successfully (length: {len(optimized_prompt)} chars)")
        return optimized_prompt

    def _extract_prompt_from_outputs(self, outputs: list, timeout: int) -> Optional[str]:
        """Extract optimized prompt from outputs, handling URLs and direct text.

        Returns None when the shape is not recognized, so the caller can raise
        a uniform error.
        """
        if not isinstance(outputs, list) or len(outputs) == 0:
            return None
        first_output = outputs[0]
        # If it's a string that looks like a URL, fetch it.
        if isinstance(first_output, str):
            if first_output.startswith("http://") or first_output.startswith("https://"):
                logger.info(f"[WaveSpeed] Fetching optimized prompt from URL: {first_output}")
                url_response = requests.get(first_output, timeout=timeout)
                if url_response.status_code == 200:
                    return url_response.text.strip()
                else:
                    logger.error(f"[WaveSpeed] Failed to fetch prompt from URL: {url_response.status_code}")
                    raise HTTPException(
                        status_code=502,
                        detail="Failed to fetch optimized prompt from WaveSpeed URL",
                    )
            else:
                # It's already the text.
                return first_output
        elif isinstance(first_output, dict):
            return first_output.get("text") or first_output.get("prompt") or first_output.get("output")
        return None

View File

@@ -0,0 +1,223 @@
"""
Speech generation generator for WaveSpeed API.
"""
import time
import requests
from typing import Optional
from requests import exceptions as requests_exceptions
from fastapi import HTTPException
from utils.logger_utils import get_service_logger
logger = get_service_logger("wavespeed.generators.speech")
class SpeechGenerator:
    """Text-to-speech generator backed by WaveSpeed's Minimax Speech 02 HD model.

    Submits TTS requests, optionally waits synchronously for the result, and
    downloads the rendered audio file.
    """

    def __init__(self, api_key: str, base_url: str, polling):
        """Initialize speech generator.

        Args:
            api_key: WaveSpeed API key
            base_url: WaveSpeed API base URL
            polling: WaveSpeedPolling instance used when sync mode is disabled
        """
        self.api_key = api_key
        self.base_url = base_url
        self.polling = polling

    def _get_headers(self) -> dict:
        """Get HTTP headers (JSON content type plus bearer auth) for API requests."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

    def generate_speech(
        self,
        text: str,
        voice_id: str,
        speed: float = 1.0,
        volume: float = 1.0,
        pitch: float = 0.0,
        emotion: str = "happy",
        enable_sync_mode: bool = True,
        timeout: int = 120,
        **kwargs
    ) -> bytes:
        """
        Generate speech audio using Minimax Speech 02 HD via WaveSpeed.

        Args:
            text: Text to convert to speech (max 10000 characters)
            voice_id: Voice ID (e.g., "Wise_Woman", "Friendly_Person", etc.)
            speed: Speech speed (0.5-2.0, default: 1.0)
            volume: Speech volume (0.1-10.0, default: 1.0)
            pitch: Speech pitch (-12 to 12, default: 0.0)
            emotion: Emotion ("happy", "sad", "angry", etc., default: "happy")
            enable_sync_mode: If True, wait for result and return it directly (default: True)
            timeout: Read/download timeout in seconds (default: 120).
                (Fixed: previously documented as 60, and ignored by the
                generation request whose read timeout was hard-coded to 60.)
            **kwargs: Additional parameters (sample_rate, bitrate, format, etc.)

        Returns:
            bytes: Generated audio bytes

        Raises:
            HTTPException: 504 on connection/read timeouts, 502 on API errors
                or unrecognized output formats.
        """
        model_path = "minimax/speech-02-hd"
        url = f"{self.base_url}/{model_path}"
        payload = {
            "text": text,
            "voice_id": voice_id,
            "speed": speed,
            "volume": volume,
            "pitch": pitch,
            "emotion": emotion,
            "enable_sync_mode": enable_sync_mode,
        }
        # Forward only the optional parameters the endpoint understands.
        optional_params = [
            "english_normalization",
            "sample_rate",
            "bitrate",
            "channel",
            "format",
            "language_boost",
        ]
        for param in optional_params:
            if param in kwargs:
                payload[param] = kwargs[param]
        logger.info(f"[WaveSpeed] Generating speech via {url} (voice={voice_id}, text_length={len(text)})")
        # Retry on transient connection issues with exponential backoff.
        max_retries = 2
        retry_delay = 2.0
        for attempt in range(max_retries + 1):
            try:
                response = requests.post(
                    url,
                    headers=self._get_headers(),
                    json=payload,
                    # Fixed: honor the caller's timeout for the read phase
                    # (previously hard-coded to 60s regardless of `timeout`).
                    timeout=(30, timeout),  # (connect, read)
                )
                break
            except (requests_exceptions.ConnectTimeout, requests_exceptions.ConnectionError) as e:
                if attempt < max_retries:
                    logger.warning(
                        f"[WaveSpeed] Speech connection attempt {attempt + 1}/{max_retries + 1} failed, "
                        f"retrying in {retry_delay}s: {e}"
                    )
                    time.sleep(retry_delay)
                    retry_delay *= 2
                    continue
                logger.error(f"[WaveSpeed] Speech connection failed after {max_retries + 1} attempts: {e}")
                raise HTTPException(
                    status_code=504,
                    detail={
                        "error": "Connection to WaveSpeed speech API timed out",
                        "message": "Unable to reach the speech service. Please try again.",
                        "exception": str(e),
                        "retry_recommended": True,
                    },
                )
            except requests_exceptions.Timeout as e:
                logger.error(f"[WaveSpeed] Speech request timeout: {e}")
                raise HTTPException(
                    status_code=504,
                    detail={
                        "error": "WaveSpeed speech request timed out",
                        "message": "The speech generation request took too long. Please try again.",
                        "exception": str(e),
                    },
                )
        if response.status_code != 200:
            logger.error(f"[WaveSpeed] Speech generation failed: {response.status_code} {response.text}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "WaveSpeed speech generation failed",
                    "status_code": response.status_code,
                    "response": response.text,
                },
            )
        response_json = response.json()
        # Some responses wrap the result in "data"; fall back to the root object.
        data = response_json.get("data") or response_json
        if enable_sync_mode:
            # Sync mode: the audio URL is directly in the response outputs.
            outputs = data.get("outputs") or []
            if not outputs:
                logger.error(f"[WaveSpeed] No outputs in sync mode response: {response.text}")
                raise HTTPException(
                    status_code=502,
                    detail="WaveSpeed speech generator returned no outputs",
                )
            audio_url = self._extract_audio_url(outputs)
            return self._download_audio(audio_url, timeout)
        # Async mode: the response only carries a prediction id; poll until done.
        prediction_id = data.get("id")
        if not prediction_id:
            logger.error(f"[WaveSpeed] No prediction ID in async response: {response.text}")
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed response missing prediction id for async mode",
            )
        result = self.polling.poll_until_complete(prediction_id, timeout_seconds=120, interval_seconds=0.5)
        outputs = result.get("outputs") or []
        if not outputs:
            raise HTTPException(status_code=502, detail="WaveSpeed speech generator returned no outputs")
        audio_url = self._extract_audio_url(outputs)
        return self._download_audio(audio_url, timeout)

    def _extract_audio_url(self, outputs: list) -> str:
        """Return the first output's audio URL, validating it is an http(s) URL.

        Raises:
            HTTPException: 502 when the outputs are empty or not URL-shaped.
        """
        if not isinstance(outputs, list) or len(outputs) == 0:
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed speech generator output format not recognized",
            )
        first_output = outputs[0]
        if isinstance(first_output, str):
            audio_url = first_output
        elif isinstance(first_output, dict):
            audio_url = first_output.get("url") or first_output.get("output")
        else:
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed speech generator output format not recognized",
            )
        if not audio_url or not (audio_url.startswith("http://") or audio_url.startswith("https://")):
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed speech generator output format not recognized",
            )
        return audio_url

    def _download_audio(self, audio_url: str, timeout: int) -> bytes:
        """Download the rendered audio file and return its raw bytes.

        Raises:
            HTTPException: 502 when the download does not return HTTP 200.
        """
        logger.info(f"[WaveSpeed] Fetching audio from URL: {audio_url}")
        audio_response = requests.get(audio_url, timeout=timeout)
        if audio_response.status_code == 200:
            audio_bytes = audio_response.content
            logger.info(f"[WaveSpeed] Speech generated successfully (size: {len(audio_bytes)} bytes)")
            return audio_bytes
        else:
            logger.error(f"[WaveSpeed] Failed to fetch audio from URL: {audio_response.status_code}")
            raise HTTPException(
                status_code=502,
                detail="Failed to fetch generated audio from WaveSpeed URL",
            )

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,253 @@
"""
Hunyuan Avatar Service
Service for creating talking avatars using Hunyuan Avatar model.
Reference: https://wavespeed.ai/models/wavespeed-ai/hunyuan-avatar
"""
from __future__ import annotations
import base64
from typing import Any, Dict, Optional
import requests
from fastapi import HTTPException
from loguru import logger
from .client import WaveSpeedClient
HUNYUAN_AVATAR_MODEL_PATH = "wavespeed-ai/hunyuan-avatar"
HUNYUAN_AVATAR_MODEL_NAME = "wavespeed-ai/hunyuan-avatar"
MAX_IMAGE_BYTES = 10 * 1024 * 1024 # 10MB
MAX_AUDIO_BYTES = 50 * 1024 * 1024 # 50MB safety cap
MAX_DURATION_SECONDS = 120 # 2 minutes maximum
MIN_DURATION_SECONDS = 5 # Minimum billable duration
def _as_data_uri(content_bytes: bytes, mime_type: str) -> str:
"""Convert bytes to data URI."""
encoded = base64.b64encode(content_bytes).decode("utf-8")
return f"data:{mime_type};base64,{encoded}"
def calculate_hunyuan_avatar_cost(resolution: str, duration: float) -> float:
"""
Calculate cost for Hunyuan Avatar video.
Pricing:
- 480p: $0.15 per 5 seconds
- 720p: $0.30 per 5 seconds
- Minimum charge: 5 seconds
- Maximum billable: 120 seconds
Args:
resolution: Output resolution (480p or 720p)
duration: Video duration in seconds
Returns:
Cost in USD
"""
# Clamp duration to valid range
actual_duration = max(MIN_DURATION_SECONDS, min(duration, MAX_DURATION_SECONDS))
# Calculate cost per 5 seconds
cost_per_5_seconds = 0.15 if resolution == "480p" else 0.30
# Round up to nearest 5 seconds
billable_5_second_blocks = (actual_duration + 4) // 5 # Ceiling division
return cost_per_5_seconds * billable_5_second_blocks
def create_hunyuan_avatar(
    *,
    image_bytes: bytes,
    audio_bytes: bytes,
    resolution: str = "480p",
    prompt: Optional[str] = None,
    seed: Optional[int] = None,
    user_id: str = "video_studio",
    image_mime: str = "image/png",
    audio_mime: str = "audio/mpeg",
    client: Optional[WaveSpeedClient] = None,
    progress_callback: Optional[callable] = None,
) -> Dict[str, Any]:
    """
    Create talking avatar video using Hunyuan Avatar.

    Reference: https://wavespeed.ai/docs/docs-api/wavespeed-ai/hunyuan-avatar

    Workflow: validate inputs -> submit media as base64 data URIs -> poll for
    up to 10 minutes -> download the produced video -> assemble a result dict.

    Args:
        image_bytes: Portrait image as bytes (max 10MB)
        audio_bytes: Audio file as bytes (max 50MB)
        resolution: Output resolution (480p or 720p, default: 480p)
        prompt: Optional text to guide expression or style
        seed: Optional random seed (-1 for random)
        user_id: User ID for tracking (currently unused in this function body)
        image_mime: MIME type of image
        audio_mime: MIME type of audio
        client: Optional WaveSpeedClient instance (a fresh one is built if omitted)
        progress_callback: Optional callable(progress: float, message: str)

    Returns:
        Dictionary with video_bytes, prompt, duration, model_name, cost, etc.

    Raises:
        HTTPException: 400 on invalid inputs; 502 on API, output, or download
            failures (detail carries prediction_id where available).
    """
    if not image_bytes:
        raise HTTPException(status_code=400, detail="Image bytes are required for Hunyuan Avatar.")
    if not audio_bytes:
        raise HTTPException(status_code=400, detail="Audio bytes are required for Hunyuan Avatar.")
    if len(image_bytes) > MAX_IMAGE_BYTES:
        raise HTTPException(
            status_code=400,
            detail=f"Image exceeds {MAX_IMAGE_BYTES / (1024 * 1024):.0f}MB limit required by Hunyuan Avatar.",
        )
    if len(audio_bytes) > MAX_AUDIO_BYTES:
        raise HTTPException(
            status_code=400,
            detail=f"Audio exceeds {MAX_AUDIO_BYTES / (1024 * 1024):.0f}MB limit allowed for Hunyuan Avatar requests.",
        )
    if resolution not in {"480p", "720p"}:
        raise HTTPException(status_code=400, detail="Resolution must be '480p' or '720p'.")
    # Build payload (media is inlined as base64 data URIs)
    payload: Dict[str, Any] = {
        "image": _as_data_uri(image_bytes, image_mime),
        "audio": _as_data_uri(audio_bytes, audio_mime),
        "resolution": resolution,
    }
    if prompt:
        payload["prompt"] = prompt.strip()
    if seed is not None:
        payload["seed"] = seed
    client = client or WaveSpeedClient()
    # Progress callback: submission
    if progress_callback:
        progress_callback(10.0, "Submitting Hunyuan Avatar request to WaveSpeed...")
    prediction_id = client.submit_image_to_video(HUNYUAN_AVATAR_MODEL_PATH, payload, timeout=60)
    try:
        # Poll for completion
        if progress_callback:
            progress_callback(20.0, f"Polling for completion (prediction_id: {prediction_id})...")
        # NOTE(review): assumes WaveSpeedClient exposes poll_until_complete
        # directly (not only via client.polling) — confirm on the client.
        result = client.poll_until_complete(
            prediction_id,
            timeout_seconds=600,  # 10 minutes max
            interval_seconds=0.5,  # Poll every 0.5 seconds
            progress_callback=progress_callback,
        )
    except HTTPException as exc:
        # Annotate the failure so callers can resume with the prediction id.
        detail = exc.detail or {}
        if isinstance(detail, dict):
            detail.setdefault("prediction_id", prediction_id)
            detail.setdefault("resume_available", True)
        raise
    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Hunyuan Avatar completed but returned no outputs",
                "prediction_id": prediction_id,
            }
        )
    video_url = outputs[0]
    if not isinstance(video_url, str) or not video_url.startswith("http"):
        raise HTTPException(
            status_code=502,
            detail={
                "error": f"Invalid video URL format: {video_url}",
                "prediction_id": prediction_id,
            }
        )
    # Progress callback: downloading video
    if progress_callback:
        progress_callback(90.0, "Downloading generated video...")
    # Download video
    try:
        video_response = requests.get(video_url, timeout=180)
        if video_response.status_code != 200:
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "Failed to download Hunyuan Avatar video",
                    "status_code": video_response.status_code,
                    "response": video_response.text[:200],
                    "prediction_id": prediction_id,
                }
            )
    except requests.exceptions.RequestException as e:
        raise HTTPException(
            status_code=502,
            detail={
                "error": f"Failed to download video: {str(e)}",
                "prediction_id": prediction_id,
            }
        )
    video_bytes = video_response.content
    if len(video_bytes) == 0:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Downloaded video is empty",
                "prediction_id": prediction_id,
            }
        )
    # Estimate duration (we don't get exact duration from API, so estimate from audio or use default)
    # For now, we'll use a default estimate - in production, you might want to analyze the audio file
    estimated_duration = 10.0  # Default estimate
    # Calculate cost (billed from the fixed estimate above, not actual output length)
    cost = calculate_hunyuan_avatar_cost(resolution, estimated_duration)
    # Get video dimensions from resolution
    resolution_dims = {
        "480p": (854, 480),
        "720p": (1280, 720),
    }
    width, height = resolution_dims.get(resolution, (854, 480))
    # Extract metadata from the API result and fold in request context
    metadata = result.get("metadata", {})
    metadata.update({
        "has_nsfw_contents": result.get("has_nsfw_contents", []),
        "created_at": result.get("created_at"),
        "resolution": resolution,
        "max_duration": MAX_DURATION_SECONDS,
    })
    logger.info(
        f"[Hunyuan Avatar] ✅ Generated video: {len(video_bytes)} bytes, "
        f"resolution={resolution}, cost=${cost:.2f}"
    )
    # Progress callback: completed
    if progress_callback:
        progress_callback(100.0, "Avatar generation completed!")
    return {
        "video_bytes": video_bytes,
        "prompt": prompt or "",
        "duration": estimated_duration,
        "model_name": HUNYUAN_AVATAR_MODEL_NAME,
        "cost": cost,
        "provider": "wavespeed",
        "resolution": resolution,
        "width": width,
        "height": height,
        "metadata": metadata,
        "source_video_url": video_url,
        "prediction_id": prediction_id,
    }

View File

@@ -0,0 +1,191 @@
from __future__ import annotations
import base64
from typing import Any, Dict, Optional
import requests
from fastapi import HTTPException
from loguru import logger
from .client import WaveSpeedClient
INFINITALK_MODEL_PATH = "wavespeed-ai/infinitetalk"
INFINITALK_MODEL_NAME = "wavespeed-ai/infinitetalk"
INFINITALK_DEFAULT_COST = 0.30 # $0.30 per 5 seconds at 720p tier
MAX_IMAGE_BYTES = 10 * 1024 * 1024 # 10MB
MAX_AUDIO_BYTES = 50 * 1024 * 1024 # 50MB safety cap
def _as_data_uri(content_bytes: bytes, mime_type: str) -> str:
encoded = base64.b64encode(content_bytes).decode("utf-8")
return f"data:{mime_type};base64,{encoded}"
def _generate_simple_infinitetalk_prompt(
scene_data: Dict[str, Any],
story_context: Dict[str, Any],
) -> Optional[str]:
"""
Generate a balanced, concise prompt for InfiniteTalk.
InfiniteTalk is audio-driven, so the prompt should describe the scene and suggest
subtle motion, but avoid overly elaborate cinematic descriptions.
Returns None if no meaningful prompt can be generated.
"""
title = (scene_data.get("title") or "").strip()
description = (scene_data.get("description") or "").strip()
image_prompt = (scene_data.get("image_prompt") or "").strip()
# Build a balanced prompt: scene description + simple motion hint
parts = []
# Start with the main subject/scene
if title and len(title) > 5 and title.lower() not in ("scene", "podcast", "episode"):
parts.append(title)
elif description:
# Take first sentence or first 60 chars
desc_part = description.split('.')[0][:60].strip()
if desc_part:
parts.append(desc_part)
elif image_prompt:
# Take first sentence or first 60 chars
img_part = image_prompt.split('.')[0][:60].strip()
if img_part:
parts.append(img_part)
if not parts:
return None
# Add a simple, subtle motion suggestion (not elaborate camera movements)
# Keep it natural and audio-driven
motion_hints = [
"with subtle movement",
"with gentle motion",
"with natural animation",
]
# Combine scene description with subtle motion hint
if len(parts[0]) < 80:
# Room for a motion hint
prompt = f"{parts[0]}, {motion_hints[0]}"
else:
# Just use the description if it's already long enough
prompt = parts[0]
# Keep it concise - max 120 characters (allows for scene + motion hint)
prompt = prompt[:120].strip()
# Clean up trailing commas or incomplete sentences
if prompt.endswith(','):
prompt = prompt[:-1].strip()
return prompt if len(prompt) >= 15 else None
def animate_scene_with_voiceover(
    *,
    image_bytes: bytes,
    audio_bytes: bytes,
    scene_data: Dict[str, Any],
    story_context: Dict[str, Any],
    user_id: str,
    resolution: str = "720p",
    prompt_override: Optional[str] = None,
    mask_image_bytes: Optional[bytes] = None,
    seed: Optional[int] = -1,
    image_mime: str = "image/png",
    audio_mime: str = "audio/mpeg",
    client: Optional[WaveSpeedClient] = None,
) -> Dict[str, Any]:
    """
    Animate a scene image with narration audio using WaveSpeed InfiniteTalk.

    Args:
        image_bytes: Scene image (max 10MB).
        audio_bytes: Narration audio (max 50MB).
        scene_data: Scene metadata used to derive an optional guidance prompt.
        story_context: Story metadata forwarded to prompt generation.
        user_id: User identifier, used for logging only.
        resolution: "480p" or "720p".
        prompt_override: Explicit prompt; skips automatic prompt generation.
        mask_image_bytes: Optional mask image forwarded to the model.
        seed: Random seed (-1 lets the service choose).
        image_mime: MIME type used for the image/mask data URIs.
        audio_mime: MIME type used for the audio data URI.
        client: Optional pre-built WaveSpeedClient (a new one is created otherwise).

    Returns:
        Dict with video bytes, prompt used, duration, model name, and cost.

    Raises:
        HTTPException: On validation failure, submission/polling errors
            (detail carries resume metadata), or a failed video download.
    """
    if not image_bytes:
        # NOTE(review): 404 is unusual for invalid input (size checks below use
        # 400); kept as-is because callers may match on this status code.
        raise HTTPException(status_code=404, detail="Scene image bytes missing for animation.")
    if not audio_bytes:
        raise HTTPException(status_code=404, detail="Scene audio bytes missing for animation.")
    if len(image_bytes) > MAX_IMAGE_BYTES:
        raise HTTPException(
            status_code=400,
            detail="Scene image exceeds 10MB limit required by WaveSpeed InfiniteTalk.",
        )
    if len(audio_bytes) > MAX_AUDIO_BYTES:
        raise HTTPException(
            status_code=400,
            detail="Scene audio exceeds 50MB limit allowed for InfiniteTalk requests.",
        )
    if resolution not in {"480p", "720p"}:
        raise HTTPException(status_code=400, detail="Resolution must be '480p' or '720p'.")
    # Generate simple, concise prompt for InfiniteTalk (audio-driven, less need for elaborate descriptions)
    animation_prompt = prompt_override or _generate_simple_infinitetalk_prompt(scene_data, story_context)
    payload: Dict[str, Any] = {
        "image": _as_data_uri(image_bytes, image_mime),
        "audio": _as_data_uri(audio_bytes, audio_mime),
        "resolution": resolution,
    }
    # Only include prompt if we have a meaningful one (InfiniteTalk works fine without it)
    if animation_prompt:
        payload["prompt"] = animation_prompt
    if mask_image_bytes:
        payload["mask_image"] = _as_data_uri(mask_image_bytes, image_mime)
    if seed is not None:
        payload["seed"] = seed
    client = client or WaveSpeedClient()
    prediction_id = client.submit_image_to_video(INFINITALK_MODEL_PATH, payload, timeout=60)
    try:
        # Poll faster (0.5s) to mirror reference pattern; allow up to 10 minutes
        result = client.poll_until_complete(prediction_id, timeout_seconds=600, interval_seconds=0.5)
    except HTTPException as exc:
        # Attach resume metadata so the caller can fetch the result later.
        detail = exc.detail or {}
        if isinstance(detail, dict):
            detail.setdefault("prediction_id", prediction_id)
            detail.setdefault("resume_available", True)
        raise
    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed InfiniteTalk completed but returned no outputs.")
    video_url = outputs[0]
    video_response = requests.get(video_url, timeout=180)
    if video_response.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Failed to download InfiniteTalk video",
                "status_code": video_response.status_code,
                "response": video_response.text[:200],
            },
        )
    metadata = result.get("metadata") or {}
    duration = metadata.get("duration_seconds") or metadata.get("duration") or 0
    # Fixed: loguru interpolates '{}' placeholders via str.format, not
    # printf-style '%s' — the original call logged literal "%s" markers and
    # silently dropped its arguments. An f-string renders correctly.
    logger.info(
        f"[InfiniteTalk] Generated talking avatar video user={user_id} "
        f"scene={scene_data.get('scene_number')} resolution={resolution} "
        f"size={len(video_response.content)} bytes"
    )
    return {
        "video_bytes": video_response.content,
        "prompt": animation_prompt,
        "duration": duration or 5,  # fall back to the 5s billing minimum when unknown
        "model_name": INFINITALK_MODEL_NAME,
        "cost": INFINITALK_DEFAULT_COST,
        "provider": "wavespeed",
        "source_video_url": video_url,
        "prediction_id": prediction_id,
    }

View File

@@ -0,0 +1,360 @@
from __future__ import annotations
import base64
import json
from typing import Any, Dict, Optional
import requests
from fastapi import HTTPException
from services.llm_providers.main_text_generation import llm_text_gen
from utils.logger_utils import get_service_logger
from .client import WaveSpeedClient
try:
import imghdr
except ModuleNotFoundError: # Python 3.13 removed imghdr
imghdr = None
logger = get_service_logger("wavespeed.kling_animation")
KLING_MODEL_PATH = "kwaivgi/kling-v2.5-turbo-std/image-to-video"
KLING_MODEL_5S = "kling-v2.5-turbo-std-5s"
KLING_MODEL_10S = "kling-v2.5-turbo-std-10s"
MAX_IMAGE_BYTES = 10 * 1024 * 1024 # 10 MB limit per docs
def _detect_image_mime(image_bytes: bytes) -> str:
if imghdr:
detected = imghdr.what(None, h=image_bytes)
if detected == "jpeg":
return "image/jpeg"
if detected == "png":
return "image/png"
if detected == "gif":
return "image/gif"
header = image_bytes[:8]
if header.startswith(b"\x89PNG"):
return "image/png"
if header[:2] == b"\xff\xd8":
return "image/jpeg"
if header[:3] in (b"GIF", b"GIF"):
return "image/gif"
return "image/png"
def _build_fallback_prompt(scene_data: Dict[str, Any], story_context: Dict[str, Any]) -> str:
title = (scene_data.get("title") or "Scene").strip()
description = (scene_data.get("description") or "").strip()
image_prompt = (scene_data.get("image_prompt") or "").strip()
tone = (story_context.get("story_tone") or "story").strip()
setting = (story_context.get("story_setting") or "the scene").strip()
parts = [
f"{title} cinematic motion shot.",
description[:220] if description else "",
f"Camera glides with subtle parallax over {setting}.",
f"Maintain a {tone} mood with natural lighting accents.",
f"Honor the original illustration details: {image_prompt[:200]}." if image_prompt else "",
"5-second sequence, gentle push-in, flowing cloth and atmospheric particles.",
]
fallback_prompt = " ".join(filter(None, parts))
return fallback_prompt.strip()
def _load_llm_json_response(response_text: Any) -> Dict[str, Any]:
"""Normalize responses from llm_text_gen (dict or JSON string)."""
if isinstance(response_text, dict):
return response_text
if isinstance(response_text, str):
return json.loads(response_text)
raise ValueError(f"Unexpected response type: {type(response_text)}")
def _generate_text_prompt(
    *,
    prompt: str,
    system_prompt: str,
    user_id: str,
    fallback_prompt: str,
) -> str:
    """Fallback text generation when structured JSON parsing fails.

    Calls llm_text_gen without a JSON schema and coerces whatever comes back
    (dict or plain text) into a usable prompt string. Returns
    ``fallback_prompt`` on any failure except HTTP 429, which is re-raised so
    rate limits surface to the caller.

    Args:
        prompt: The user prompt to send to the LLM.
        system_prompt: System instruction for the LLM.
        user_id: Forwarded to the LLM provider.
        fallback_prompt: Deterministic prompt returned when generation fails.
    """
    try:
        response = llm_text_gen(
            prompt=prompt.strip(),
            system_prompt=system_prompt,
            user_id=user_id,
        )
    except HTTPException as exc:
        if exc.status_code == 429:
            # Propagate rate limiting instead of silently degrading output.
            raise
        # NOTE(review): %-style lazy args assume get_service_logger returns a
        # stdlib-style logger — confirm against utils.logger_utils.
        logger.warning(
            "[AnimateScene] Text-mode prompt generation failed (%s). Using deterministic fallback.",
            exc.detail,
        )
        return fallback_prompt
    except Exception as exc:
        logger.error(
            "[AnimateScene] Unexpected error generating text prompt: %s",
            exc,
            exc_info=True,
        )
        return fallback_prompt
    if isinstance(response, dict):
        # Prefer the known keys; first non-empty string wins.
        candidates = [
            response.get("animation_prompt"),
            response.get("prompt"),
            response.get("text"),
        ]
        for candidate in candidates:
            if isinstance(candidate, str) and candidate.strip():
                return candidate.strip()
        # As a last resort, stringify the dict
        response_text = json.dumps(response, ensure_ascii=False)
    else:
        response_text = str(response)
    cleaned = response_text.strip()
    return cleaned or fallback_prompt
def generate_animation_prompt(
    scene_data: Dict[str, Any],
    story_context: Dict[str, Any],
    user_id: str,
) -> str:
    """
    Generate an animation-focused prompt using llm_text_gen.

    Strategy: ask for structured JSON first; on schema/parsing failure fall
    back to free-text generation; on any other failure return a deterministic
    prompt built from the scene data. HTTP 429 (rate limit) is always
    re-raised so callers can handle it.

    Args:
        scene_data: Scene title/description/image_prompt.
        story_context: Story tone and setting.
        user_id: Forwarded to the LLM provider.

    Returns:
        A non-empty prompt string.
    """
    fallback_prompt = _build_fallback_prompt(scene_data, story_context)
    system_prompt = (
        "You are an expert cinematic animation director. "
        "You transform static illustrated scenes into short cinematic motion clips. "
        "Describe motion, camera behavior, atmosphere, and pacing."
    )
    description = scene_data.get("description", "")
    image_prompt = scene_data.get("image_prompt", "")
    title = scene_data.get("title", "")
    # Fixed: these previously read the same key twice
    # (`story_context.get("story_tone") or story_context.get("story_tone", "")`);
    # a single lookup with an empty-string fallback is equivalent.
    tone = story_context.get("story_tone") or ""
    setting = story_context.get("story_setting") or ""
    prompt = f"""
Create a concise animation prompt (2-3 sentences) for a 5-second cinematic clip.
Scene Title: {title}
Description: {description}
Existing Image Prompt: {image_prompt}
Story Tone: {tone}
Setting: {setting}
Focus on:
- Motion of characters/objects
- Camera movement (pan, zoom, dolly, orbit)
- Atmosphere, lighting, and emotion
- Timing cues appropriate for a {tone or "story"} scene
Respond with JSON: {{"animation_prompt": "<prompt>"}}
"""
    try:
        response = llm_text_gen(
            prompt=prompt.strip(),
            system_prompt=system_prompt,
            user_id=user_id,
            json_struct={
                "type": "object",
                "properties": {
                    "animation_prompt": {
                        "type": "string",
                        "description": "A cinematic motion prompt for the WaveSpeed image-to-video model.",
                    }
                },
                "required": ["animation_prompt"],
            },
        )
        structured = _load_llm_json_response(response)
        animation_prompt = structured.get("animation_prompt")
        if not animation_prompt or not isinstance(animation_prompt, str):
            raise ValueError("Missing animation_prompt in structured response")
        cleaned_prompt = animation_prompt.strip()
        if not cleaned_prompt:
            raise ValueError("animation_prompt is empty after trimming")
        return cleaned_prompt
    except HTTPException as exc:
        if exc.status_code == 429:
            # Propagate rate limiting instead of silently degrading output.
            raise
        logger.warning(
            "[AnimateScene] Structured LLM prompt generation failed (%s). Falling back to text parsing.",
            exc.detail,
        )
        return _generate_text_prompt(
            prompt=prompt,
            system_prompt=system_prompt,
            user_id=user_id,
            fallback_prompt=fallback_prompt,
        )
    except (json.JSONDecodeError, ValueError, KeyError) as exc:
        logger.warning(
            "[AnimateScene] Failed to parse structured animation prompt (%s). Falling back to text parsing.",
            exc,
        )
        return _generate_text_prompt(
            prompt=prompt,
            system_prompt=system_prompt,
            user_id=user_id,
            fallback_prompt=fallback_prompt,
        )
    except Exception as exc:
        logger.error(
            "[AnimateScene] Unexpected error generating animation prompt: %s",
            exc,
            exc_info=True,
        )
        return fallback_prompt
def animate_scene_image(
    *,
    image_bytes: bytes,
    scene_data: Dict[str, Any],
    story_context: Dict[str, Any],
    user_id: str,
    duration: int = 5,
    guidance_scale: float = 0.5,
    negative_prompt: Optional[str] = None,
    client: Optional[WaveSpeedClient] = None,
) -> Dict[str, Any]:
    """
    Animate a scene image using WaveSpeed Kling v2.5 Turbo Std.

    Args:
        image_bytes: Source still image (max 10MB).
        scene_data: Scene metadata used for LLM prompt generation.
        story_context: Story tone/setting used for LLM prompt generation.
        user_id: Forwarded to the LLM prompt generator.
        duration: Clip length; must be 5 or 10 seconds.
        guidance_scale: Clamped into [0.0, 1.0] before submission.
        negative_prompt: Optional negative prompt forwarded to the model.
        client: Optional pre-built WaveSpeedClient.

    Returns:
        Dict with video bytes, prompt used, model name, duration, and cost.

    Raises:
        HTTPException: On validation errors, polling failure (with resume
            metadata attached to the detail), or video download failure.
    """
    if duration not in (5, 10):
        raise HTTPException(status_code=400, detail="Duration must be 5 or 10 seconds for scene animation.")
    if len(image_bytes) > MAX_IMAGE_BYTES:
        raise HTTPException(
            status_code=400,
            detail="Scene image exceeds 10MB limit required by WaveSpeed."
        )
    # Clamp rather than reject out-of-range guidance values.
    guidance_scale = max(0.0, min(1.0, guidance_scale))
    animation_prompt = generate_animation_prompt(scene_data, story_context, user_id)
    # NOTE(review): this endpoint is sent plain base64 (no "data:" URI prefix),
    # unlike the avatar services — confirm against the Kling API docs.
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")
    payload = {
        "duration": duration,
        "guidance_scale": guidance_scale,
        "image": image_b64,
        "prompt": animation_prompt,
    }
    if negative_prompt:
        payload["negative_prompt"] = negative_prompt.strip()
    client = client or WaveSpeedClient()
    prediction_id = client.submit_image_to_video(KLING_MODEL_PATH, payload)
    try:
        result = client.poll_until_complete(prediction_id, timeout_seconds=240, interval_seconds=1.0)
    except HTTPException as exc:
        # Attach resume metadata so callers can fetch the video later.
        detail = exc.detail or {}
        if isinstance(detail, dict):
            detail.setdefault("prediction_id", prediction_id)
            detail.setdefault("resume_available", True)
            detail.setdefault("message", "WaveSpeed request is still processing. Use resume endpoint to fetch the video once ready.")
        raise HTTPException(status_code=exc.status_code, detail=detail)
    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed completed but returned no outputs.")
    video_url = outputs[0]
    video_response = requests.get(video_url, timeout=60)
    if video_response.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Failed to download animation video",
                "status_code": video_response.status_code,
                "response": video_response.text[:200],
            },
        )
    # Model label and flat pricing depend only on the clip duration.
    model_name = KLING_MODEL_5S if duration == 5 else KLING_MODEL_10S
    cost = 0.21 if duration == 5 else 0.42
    return {
        "video_bytes": video_response.content,
        "prompt": animation_prompt,
        "duration": duration,
        "model_name": model_name,
        "cost": cost,
        "provider": "wavespeed",
        "source_video_url": video_url,
        "prediction_id": prediction_id,
    }
def resume_scene_animation(
    *,
    prediction_id: str,
    duration: int,
    user_id: str,
    client: Optional[WaveSpeedClient] = None,
) -> Dict[str, Any]:
    """
    Resume a previously submitted animation by fetching the completed result.

    Args:
        prediction_id: WaveSpeed prediction to look up.
        duration: Original clip duration (5 or 10) — needed to reconstruct the
            model name and cost, which are not returned by the API.
        user_id: Currently unused in this function body; kept for call-site parity.
        client: Optional pre-built WaveSpeedClient.

    Returns:
        Same result dict shape as animate_scene_image().

    Raises:
        HTTPException: 400 for a bad duration, 409 when the prediction is not
            finished yet, 502 on missing outputs or download failure.
    """
    if duration not in (5, 10):
        raise HTTPException(status_code=400, detail="Duration must be 5 or 10 seconds for scene animation.")
    client = client or WaveSpeedClient()
    result = client.get_prediction_result(prediction_id, timeout=120)
    status = result.get("status")
    if status != "completed":
        # 409 signals "not ready yet" so the caller can retry later.
        raise HTTPException(
            status_code=409,
            detail={
                "error": "WaveSpeed prediction is not completed yet",
                "prediction_id": prediction_id,
                "status": status,
            },
        )
    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed completed but returned no outputs.")
    video_url = outputs[0]
    video_response = requests.get(video_url, timeout=120)
    if video_response.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Failed to download animation video during resume",
                "status_code": video_response.status_code,
                "response": video_response.text[:200],
                "prediction_id": prediction_id,
            },
        )
    # The original prompt may not be stored with the prediction; default to "".
    animation_prompt = result.get("prompt") or ""
    # Model label and flat pricing depend only on the clip duration.
    model_name = KLING_MODEL_5S if duration == 5 else KLING_MODEL_10S
    cost = 0.21 if duration == 5 else 0.42
    logger.info("[AnimateScene] Resumed download for prediction=%s", prediction_id)
    return {
        "video_bytes": video_response.content,
        "prompt": animation_prompt,
        "duration": duration,
        "model_name": model_name,
        "cost": cost,
        "provider": "wavespeed",
        "source_video_url": video_url,
        "prediction_id": prediction_id,
    }

View File

@@ -0,0 +1,203 @@
"""
Polling utilities for WaveSpeed API.
"""
import time
from typing import Any, Dict, Optional, Callable
import requests
from fastapi import HTTPException
from requests import exceptions as requests_exceptions
from utils.logger_utils import get_service_logger
logger = get_service_logger("wavespeed.polling")
class WaveSpeedPolling:
"""Polling utilities for WaveSpeed API predictions."""
    def __init__(self, api_key: str, base_url: str):
        """Initialize polling utilities.

        Args:
            api_key: WaveSpeed API key, used as a bearer token on poll requests
            base_url: WaveSpeed API base URL (e.g. "https://api.wavespeed.ai/api/v3")
        """
        self.api_key = api_key
        self.base_url = base_url
    def _get_headers(self) -> Dict[str, str]:
        """Get HTTP headers for API requests (bearer auth only — polling GETs carry no body)."""
        return {"Authorization": f"Bearer {self.api_key}"}
    def get_prediction_result(self, prediction_id: str, timeout: int = 30) -> Dict[str, Any]:
        """
        Fetch the current status/result for a prediction.

        Matches the example pattern: simple GET request, check status_code == 200, return data.

        Args:
            prediction_id: WaveSpeed prediction to look up.
            timeout: Per-request timeout in seconds.

        Returns:
            The "data" object from the WaveSpeed response.

        Raises:
            HTTPException: 504 on request timeout; 502 on any other request
                failure, a non-200 status, or a response missing "data". The
                detail dict carries prediction_id and resume_available=True on
                the transport-level failures so callers can resume later.
        """
        url = f"{self.base_url}/predictions/{prediction_id}/result"
        headers = self._get_headers()
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests_exceptions.Timeout as exc:
            raise HTTPException(
                status_code=504,
                detail={
                    "error": "WaveSpeed polling request timed out",
                    "prediction_id": prediction_id,
                    "resume_available": True,
                    "exception": str(exc),
                },
            ) from exc
        except requests_exceptions.RequestException as exc:
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "WaveSpeed polling request failed",
                    "prediction_id": prediction_id,
                    "resume_available": True,
                    "exception": str(exc),
                },
            ) from exc
        # Match example pattern: check status_code == 200, then get data
        if response.status_code == 200:
            result = response.json().get("data")
            if not result:
                raise HTTPException(status_code=502, detail={"error": "WaveSpeed polling response missing data"})
            return result
        else:
            # Non-200 status - log and raise error (matching example's break behavior)
            logger.error(f"[WaveSpeed] Polling failed: {response.status_code} {response.text}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "WaveSpeed prediction polling failed",
                    "status_code": response.status_code,
                    "response": response.text,
                },
            )
def poll_until_complete(
self,
prediction_id: str,
timeout_seconds: Optional[int] = None,
interval_seconds: float = 1.0,
progress_callback: Optional[Callable[[float, str], None]] = None,
) -> Dict[str, Any]:
"""
Poll WaveSpeed until the job completes or fails.
Matches the example pattern: simple polling loop until status is "completed" or "failed".
Args:
prediction_id: The prediction ID to poll for
timeout_seconds: Optional timeout in seconds. If None, polls indefinitely until completion/failure.
interval_seconds: Seconds to wait between polling attempts (default: 1.0, faster than 2.0)
progress_callback: Optional callback function(progress: float, message: str) for progress updates
Returns:
Dict containing the completed result
Raises:
HTTPException: If the task fails, polling fails, or times out (if timeout_seconds is set)
"""
start_time = time.time()
consecutive_errors = 0
max_consecutive_errors = 6 # safety guard for non-transient errors
while True:
try:
result = self.get_prediction_result(prediction_id)
consecutive_errors = 0 # Reset error counter on success
except HTTPException as exc:
detail = exc.detail or {}
if isinstance(detail, dict):
detail.setdefault("prediction_id", prediction_id)
detail.setdefault("resume_available", True)
detail.setdefault("error", detail.get("error", "WaveSpeed polling failed"))
# Determine underlying status code (WaveSpeed vs proxy)
status_code = detail.get("status_code", exc.status_code)
# Treat 5xx as transient: keep polling indefinitely with backoff
if 500 <= int(status_code) < 600:
consecutive_errors += 1
backoff = min(30.0, interval_seconds * (2 ** (consecutive_errors - 1)))
logger.warning(
f"[WaveSpeed] Transient polling error {consecutive_errors} for {prediction_id}: "
f"{status_code}. Backing off {backoff:.1f}s"
)
time.sleep(backoff)
continue
# For non-transient (typically 4xx) errors, apply safety cap
consecutive_errors += 1
if consecutive_errors >= max_consecutive_errors:
logger.error(
f"[WaveSpeed] Too many polling errors ({consecutive_errors}) for {prediction_id}, "
f"status_code={status_code}. Giving up."
)
raise HTTPException(status_code=exc.status_code, detail=detail) from exc
backoff = min(30.0, interval_seconds * (2 ** (consecutive_errors - 1)))
logger.warning(
f"[WaveSpeed] Polling error {consecutive_errors}/{max_consecutive_errors} for {prediction_id}: "
f"{status_code}. Backing off {backoff:.1f}s"
)
time.sleep(backoff)
continue
# Extract status from result (matching example pattern)
status = result.get("status")
if status == "completed":
elapsed = time.time() - start_time
logger.info(f"[WaveSpeed] Prediction {prediction_id} completed in {elapsed:.1f}s")
return result
if status == "failed":
error_msg = result.get("error", "Unknown error")
logger.error(f"[WaveSpeed] Prediction {prediction_id} failed: {error_msg}")
raise HTTPException(
status_code=502,
detail={
"error": "WaveSpeed task failed",
"prediction_id": prediction_id,
"message": error_msg,
"details": result,
},
)
# Check timeout only if specified
if timeout_seconds is not None:
elapsed = time.time() - start_time
if elapsed > timeout_seconds:
logger.error(f"[WaveSpeed] Prediction {prediction_id} timed out after {timeout_seconds}s")
raise HTTPException(
status_code=504,
detail={
"error": "WaveSpeed task timed out",
"prediction_id": prediction_id,
"timeout_seconds": timeout_seconds,
"current_status": status,
"message": f"Task did not complete within {timeout_seconds} seconds. Status: {status}",
},
)
# Log progress periodically (every 30 seconds)
elapsed = time.time() - start_time
if int(elapsed) % 30 == 0 and elapsed > 0:
logger.info(f"[WaveSpeed] Polling {prediction_id}: status={status}, elapsed={elapsed:.0f}s")
# Call progress callback if provided
if progress_callback:
# Map elapsed time to progress (20-80% range during polling)
# Assume typical completion time is timeout_seconds or 120s default
estimated_total = timeout_seconds or 120
progress = min(80.0, 20.0 + (elapsed / estimated_total) * 60.0)
progress_callback(progress, f"Video generation in progress... ({elapsed:.0f}s)")
# Poll faster (1.0s instead of 2.0s) to match example's responsiveness
time.sleep(interval_seconds)