Files
ALwrity/backend/services/wavespeed/generators/video.py.old

1330 lines
53 KiB
Python

"""
Video generation generator for WaveSpeed API.
"""
import requests
from typing import Any, Dict, Optional, Callable
from fastapi import HTTPException
from utils.logger_utils import get_service_logger
# Module-level logger for this file, obtained from the project's logging
# helper under the "wavespeed.generators.video" service name.
logger = get_service_logger("wavespeed.generators.video")
class VideoGenerator:
"""Video generation generator."""
def __init__(self, api_key: str, base_url: str, polling):
"""Initialize video generator.
Args:
api_key: WaveSpeed API key
base_url: WaveSpeed API base URL
polling: WaveSpeedPolling instance for async operations
"""
self.api_key = api_key
self.base_url = base_url
self.polling = polling
def _get_headers(self) -> dict:
"""Get HTTP headers for API requests."""
return {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.api_key}",
}
def submit_image_to_video(
    self,
    model_path: str,
    payload: Dict[str, Any],
    timeout: int = 30,
) -> str:
    """Submit an image-to-video generation request.

    Args:
        model_path: Model path appended to the API base URL.
        payload: JSON-serializable request body.
        timeout: Request timeout in seconds.

    Returns:
        Prediction ID for polling.

    Raises:
        HTTPException: 502 if the API answers with a non-200 status, or if
            the response body is not a JSON object carrying ``data.id``.
    """
    url = f"{self.base_url}/{model_path}"
    logger.info(f"[WaveSpeed] Submitting request to {url}")
    response = requests.post(url, headers=self._get_headers(), json=payload, timeout=timeout)

    if response.status_code != 200:
        logger.error(f"[WaveSpeed] Submission failed: {response.status_code} {response.text}")
        raise HTTPException(
            status_code=502,
            detail={
                "error": "WaveSpeed image-to-video submission failed",
                "status_code": response.status_code,
                "response": response.text,
            },
        )

    # A 200 status does not guarantee a JSON object body. Route an
    # undecodable or wrongly-shaped body through the same 502 as a missing
    # prediction id instead of letting ValueError/AttributeError escape.
    try:
        body = response.json()
    except ValueError:
        body = None
    data = body.get("data") if isinstance(body, dict) else None

    if not isinstance(data, dict) or "id" not in data:
        logger.error(f"[WaveSpeed] Unexpected submission response: {response.text}")
        raise HTTPException(
            status_code=502,
            detail={"error": "WaveSpeed response missing prediction id"},
        )

    prediction_id = data["id"]
    logger.info(f"[WaveSpeed] Submitted request: {prediction_id}")
    return prediction_id
def submit_text_to_video(
    self,
    model_path: str,
    payload: Dict[str, Any],
    timeout: int = 60,
) -> str:
    """Submit a text-to-video generation request to WaveSpeed.

    Args:
        model_path: Model path (e.g., "alibaba/wan-2.5/text-to-video").
        payload: Request payload with prompt, resolution, duration, optional audio.
        timeout: Request timeout in seconds.

    Returns:
        Prediction ID for polling.

    Raises:
        HTTPException: 502 if the API answers with a non-200 status, or if
            the response body is not a JSON object carrying ``data.id``.
    """
    url = f"{self.base_url}/{model_path}"
    logger.info(f"[WaveSpeed] Submitting text-to-video request to {url}")
    response = requests.post(url, headers=self._get_headers(), json=payload, timeout=timeout)

    if response.status_code != 200:
        logger.error(f"[WaveSpeed] Text-to-video submission failed: {response.status_code} {response.text}")
        raise HTTPException(
            status_code=502,
            detail={
                "error": "WaveSpeed text-to-video submission failed",
                "status_code": response.status_code,
                "response": response.text,
            },
        )

    # A 200 status does not guarantee a JSON object body. Route an
    # undecodable or wrongly-shaped body through the same 502 as a missing
    # prediction id instead of letting ValueError/AttributeError escape.
    try:
        body = response.json()
    except ValueError:
        body = None
    data = body.get("data") if isinstance(body, dict) else None

    if not isinstance(data, dict) or "id" not in data:
        logger.error(f"[WaveSpeed] Unexpected text-to-video response: {response.text}")
        raise HTTPException(
            status_code=502,
            detail={"error": "WaveSpeed response missing prediction id"},
        )

    prediction_id = data["id"]
    logger.info(f"[WaveSpeed] Submitted text-to-video request: {prediction_id}")
    return prediction_id
def generate_text_video(
    self,
    prompt: str,
    resolution: str = "720p",  # 480p, 720p, 1080p
    duration: int = 5,  # 5 or 10 seconds
    audio_base64: Optional[str] = None,  # Optional audio for lip-sync
    negative_prompt: Optional[str] = None,
    seed: Optional[int] = None,
    enable_prompt_expansion: bool = True,
    enable_sync_mode: bool = False,
    timeout: int = 180,
) -> Dict[str, Any]:
    """Generate video from a text prompt using WAN 2.5 text-to-video.

    Args:
        prompt: Text prompt describing the video.
        resolution: Output resolution (480p, 720p, 1080p).
        duration: Video duration in seconds (5 or 10).
        audio_base64: Optional audio file (wav/mp3, 3-30s, <=15MB) for lip-sync.
        negative_prompt: Optional negative prompt.
        seed: Optional random seed for reproducibility.
        enable_prompt_expansion: Enable prompt optimizer.
        enable_sync_mode: If True, post the request directly and use the
            result from that response when it is already completed;
            otherwise (or when the response is not yet completed) poll.
        timeout: Timeout in seconds, used for the HTTP request and,
            separately, as the polling timeout.

    Returns:
        Dictionary with the keys ``video_bytes``, ``prompt``, ``duration``,
        ``model_name``, ``cost``, ``provider``, ``source_video_url``,
        ``prediction_id``, ``resolution``, ``width``, ``height`` and
        ``metadata``.

    Raises:
        HTTPException: 400 for an invalid resolution or duration; 502 when
            WaveSpeed returns a failure status, an unusable response body,
            no outputs, or an output that is not an http(s) URL string.
            Errors raised while polling are re-raised with
            ``prediction_id`` and ``resume_available`` added to a dict
            detail.
    """
    model_path = "alibaba/wan-2.5/text-to-video"

    # Validate resolution
    valid_resolutions = ["480p", "720p", "1080p"]
    if resolution not in valid_resolutions:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid resolution: {resolution}. Must be one of: {valid_resolutions}"
        )

    # Validate duration
    if duration not in [5, 10]:
        raise HTTPException(
            status_code=400,
            detail="Duration must be 5 or 10 seconds"
        )

    # Build payload; optional fields are only sent when provided.
    payload = {
        "prompt": prompt,
        "resolution": resolution,
        "duration": duration,
        "enable_prompt_expansion": enable_prompt_expansion,
        "enable_sync_mode": enable_sync_mode,
    }
    if audio_base64:
        payload["audio"] = audio_base64
    if negative_prompt:
        payload["negative_prompt"] = negative_prompt
    if seed is not None:
        payload["seed"] = seed

    logger.info(
        f"[WaveSpeed] Generating text-to-video: resolution={resolution}, "
        f"duration={duration}s, prompt_length={len(prompt)}, sync_mode={enable_sync_mode}"
    )

    def _poll_with_resume_hint(pid):
        # Shared by the sync-fallback and async paths: poll, and on failure
        # tag a dict detail with the prediction ID and a resumable marker.
        try:
            return self.polling.poll_until_complete(
                pid,
                timeout_seconds=timeout,
                interval_seconds=2.0,
            )
        except HTTPException as e:
            detail = e.detail or {}
            if isinstance(detail, dict):
                detail.setdefault("prediction_id", pid)
                detail.setdefault("resume_available", True)
            raise HTTPException(status_code=e.status_code, detail=detail)

    if enable_sync_mode:
        # Sync mode: post directly and inspect the response ourselves.
        url = f"{self.base_url}/{model_path}"
        response = requests.post(url, headers=self._get_headers(), json=payload, timeout=timeout)
        if response.status_code != 200:
            logger.error(f"[WaveSpeed] Text-to-video submission failed: {response.status_code} {response.text}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "WaveSpeed text-to-video submission failed",
                    "status_code": response.status_code,
                    "response": response.text[:500],
                },
            )

        # A 200 status does not guarantee a JSON object body; report an
        # unusable body as a 502 rather than crashing on .get() below.
        try:
            response_json = response.json()
        except ValueError:
            response_json = None
        if isinstance(response_json, dict):
            data = response_json.get("data") or response_json
        else:
            data = None
        if not isinstance(data, dict):
            logger.error(f"[WaveSpeed] Unexpected text-to-video response: {response.text[:500]}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "WaveSpeed text-to-video returned an unexpected response",
                    "response": response.text[:500],
                },
            )

        # "status" may be present but null; normalise before lowering.
        status = str(data.get("status") or "").lower()
        outputs = data.get("outputs") or []
        prediction_id = data.get("id")
        logger.debug(
            f"[WaveSpeed] Sync mode response: status='{status}', outputs_count={len(outputs)}, "
            f"prediction_id={prediction_id}"
        )

        if status == "completed" and outputs:
            # Sync mode returned a completed result - use it directly.
            logger.info(f"[WaveSpeed] Got immediate video results from sync mode (status: {status})")
            video_url = outputs[0]
            if not isinstance(video_url, str) or not video_url.startswith("http"):
                logger.error(f"[WaveSpeed] Invalid video URL format in sync mode: {video_url}")
                raise HTTPException(
                    status_code=502,
                    detail=f"Invalid video URL format: {video_url}",
                )
            video_bytes = self._download_video(video_url)
            metadata = data.get("metadata") or {}
        else:
            # Any other status (e.g. "created", "processing") needs polling,
            # which is only possible with a prediction ID.
            if not prediction_id:
                logger.error(
                    f"[WaveSpeed] Sync mode returned status '{status}' but no prediction ID. "
                    f"Response: {response.text[:500]}"
                )
                raise HTTPException(
                    status_code=502,
                    detail="WaveSpeed text-to-video sync mode returned async response without prediction ID",
                )
            logger.info(
                f"[WaveSpeed] Sync mode returned status '{status}' with {len(outputs)} output(s). "
                f"Falling back to polling (prediction_id: {prediction_id})"
            )
            result = _poll_with_resume_hint(prediction_id)
            outputs = result.get("outputs") or []
            if not outputs:
                logger.error(f"[WaveSpeed] Polling completed but no outputs: {result}")
                raise HTTPException(
                    status_code=502,
                    detail="WaveSpeed text-to-video completed but returned no outputs",
                )
            video_url = outputs[0]
            if not isinstance(video_url, str) or not video_url.startswith("http"):
                logger.error(f"[WaveSpeed] Invalid video URL format after polling: {video_url}")
                raise HTTPException(
                    status_code=502,
                    detail=f"Invalid video URL format: {video_url}",
                )
            video_bytes = self._download_video(video_url)
            metadata = result.get("metadata") or {}
    else:
        # Async mode - submit and poll.
        prediction_id = self.submit_text_to_video(model_path, payload, timeout=timeout)
        result = _poll_with_resume_hint(prediction_id)
        outputs = result.get("outputs") or []
        if not outputs:
            raise HTTPException(
                status_code=502,
                detail="WAN 2.5 text-to-video completed but returned no outputs"
            )
        video_url = outputs[0]
        if not isinstance(video_url, str) or not video_url.startswith("http"):
            raise HTTPException(
                status_code=502,
                detail=f"Invalid video URL format: {video_url}"
            )
        video_bytes = self._download_video(video_url)
        metadata = result.get("metadata") or {}

    # Calculate cost (same pricing as image-to-video): per-second rate
    # for the resolution multiplied by the duration.
    pricing = {
        "480p": 0.05,
        "720p": 0.10,
        "1080p": 0.15,
    }
    cost = pricing.get(resolution, 0.10) * duration

    # Nominal frame size reported for each resolution label.
    resolution_dims = {
        "480p": (854, 480),
        "720p": (1280, 720),
        "1080p": (1920, 1080),
    }
    width, height = resolution_dims.get(resolution, (1280, 720))

    logger.info(
        f"[WaveSpeed] ✅ Generated text-to-video: {len(video_bytes)} bytes, "
        f"resolution={resolution}, duration={duration}s, cost=${cost:.2f}"
    )
    return {
        "video_bytes": video_bytes,
        "prompt": prompt,
        "duration": float(duration),
        "model_name": "alibaba/wan-2.5/text-to-video",
        "cost": cost,
        "provider": "wavespeed",
        "source_video_url": video_url,
        "prediction_id": prediction_id,
        "resolution": resolution,
        "width": width,
        "height": height,
        "metadata": metadata,
    }
def _download_video(self, video_url: str, timeout: int = 180) -> bytes:
    """Download a generated video and return its raw bytes.

    Args:
        video_url: URL of the video to fetch.
        timeout: Request timeout in seconds. Defaults to 180, the value
            that was previously hard-coded, so existing callers are
            unaffected.

    Returns:
        The response body as bytes.

    Raises:
        HTTPException: 502 if the download responds with a non-200 status;
            the detail carries the status code and the first 200 characters
            of the response text.
    """
    logger.info(f"[WaveSpeed] Downloading video from: {video_url}")
    video_response = requests.get(video_url, timeout=timeout)
    if video_response.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Failed to download WAN 2.5 video",
                "status_code": video_response.status_code,
                "response": video_response.text[:200],
            }
        )
    return video_response.content
def upscale_video(
    self,
    video: str,  # Base64-encoded video or URL
    target_resolution: str = "1080p",
    enable_sync_mode: bool = False,
    timeout: int = 300,
    progress_callback: Optional[Callable[[float, str], None]] = None,
) -> bytes:
    """Upscale video using FlashVSR.

    Args:
        video: Base64-encoded video data URI or public URL.
        target_resolution: Target resolution ("720p", "1080p", "2k", "4k").
        enable_sync_mode: Accepted for interface compatibility; this method
            does not read it and always polls until the task completes.
        timeout: Timeout in seconds (default: 300), applied to the
            submission request, the polling loop and the download.
        progress_callback: Optional callback function(progress: float,
            message: str), forwarded to the poller for progress updates.

    Returns:
        bytes: Upscaled video bytes.

    Raises:
        HTTPException: 502 if submission fails, the response lacks a
            prediction id, no usable output URL is returned, or the
            download fails.
    """
    model_path = "wavespeed-ai/flashvsr"
    url = f"{self.base_url}/{model_path}"
    payload = {
        "video": video,
        "target_resolution": target_resolution,
    }
    logger.info(f"[WaveSpeed] Upscaling video via {url} (target={target_resolution})")

    # Submit the task
    response = requests.post(url, headers=self._get_headers(), json=payload, timeout=timeout)
    if response.status_code != 200:
        logger.error(f"[WaveSpeed] FlashVSR submission failed: {response.status_code} {response.text}")
        raise HTTPException(
            status_code=502,
            detail={
                "error": "WaveSpeed FlashVSR submission failed",
                "status_code": response.status_code,
                "response": response.text,
            },
        )

    # The prediction may be nested under "data" or sit at the top level.
    response_json = response.json()
    data = response_json.get("data") or response_json
    prediction_id = data.get("id")
    if not prediction_id:
        logger.error(f"[WaveSpeed] No prediction ID in FlashVSR response: {response.text}")
        raise HTTPException(
            status_code=502,
            detail="WaveSpeed FlashVSR response missing prediction id",
        )
    logger.info(f"[WaveSpeed] FlashVSR task submitted: {prediction_id}")

    # Poll for result
    result = self.polling.poll_until_complete(
        prediction_id,
        timeout_seconds=timeout,
        interval_seconds=2.0,
        progress_callback=progress_callback,
    )
    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed FlashVSR returned no outputs")

    # Outputs can be an array of strings or an array of objects. Anything
    # else is reported as an unrecognized format instead of raising
    # AttributeError on a missing .get().
    first_output = outputs[0]
    video_url = None
    if isinstance(first_output, str):
        video_url = first_output
    elif isinstance(first_output, dict):
        video_url = first_output.get("url") or first_output.get("video_url")
    if not video_url:
        raise HTTPException(status_code=502, detail="WaveSpeed FlashVSR output format not recognized")

    # Download the upscaled video
    logger.info(f"[WaveSpeed] Downloading upscaled video from: {video_url}")
    video_response = requests.get(video_url, timeout=timeout)
    if video_response.status_code != 200:
        logger.error(f"[WaveSpeed] Failed to download upscaled video: {video_response.status_code}")
        raise HTTPException(
            status_code=502,
            detail="Failed to download upscaled video from WaveSpeed",
        )
    video_bytes = video_response.content
    logger.info(f"[WaveSpeed] Video upscaling completed successfully (size: {len(video_bytes)} bytes)")
    return video_bytes
def extend_video(
    self,
    video: str,  # Base64-encoded video or URL
    prompt: str,
    model: str = "wan-2.5",  # "wan-2.5", "wan-2.2-spicy", or "seedance-1.5-pro"
    audio: Optional[str] = None,  # Optional audio URL (WAN 2.5 only)
    negative_prompt: Optional[str] = None,  # WAN 2.5 only
    resolution: str = "720p",
    duration: int = 5,
    enable_prompt_expansion: bool = False,  # WAN 2.5 only
    generate_audio: bool = True,  # Seedance 1.5 Pro only
    camera_fixed: bool = False,  # Seedance 1.5 Pro only
    seed: Optional[int] = None,
    enable_sync_mode: bool = False,
    timeout: int = 300,
    progress_callback: Optional[Callable[[float, str], None]] = None,
) -> bytes:
    """Extend a video with WAN 2.5, WAN 2.2 Spicy, or Seedance 1.5 Pro.

    Args:
        video: Base64-encoded video data URI or public URL.
        prompt: Text prompt describing how to extend the video.
        model: Short name or full model path of the extend model; any value
            other than the WAN 2.2 Spicy / Seedance 1.5 Pro names selects
            WAN 2.5.
        audio: Optional audio URL to guide generation (sent for WAN 2.5 only).
        negative_prompt: Optional negative prompt (sent for WAN 2.5 only).
        resolution: Output resolution (varies by model).
        duration: Duration of extended video in seconds (varies by model).
        enable_prompt_expansion: Enable prompt optimizer (sent for WAN 2.5 only).
        generate_audio: Generate audio (sent for Seedance 1.5 Pro only).
        camera_fixed: Fix camera position (sent for Seedance 1.5 Pro only).
        seed: Random seed for reproducibility (-1 for random); omitted from
            the request when None.
        enable_sync_mode: Not read by this method; it always polls.
        timeout: Timeout in seconds (default: 300) for the submission
            request, the polling loop and the download.
        progress_callback: Optional callback function(progress: float,
            message: str) forwarded to the poller.

    Returns:
        bytes: Extended video bytes.

    Raises:
        HTTPException: 502 if submission, result extraction or download fails.
    """
    wan_25_path = "alibaba/wan-2.5/video-extend"
    spicy_path = "wavespeed-ai/wan-2.2-spicy/video-extend"
    seedance_path = "bytedance/seedance-v1.5-pro/video-extend"

    # Resolve the short alias or full path; unknown names fall back to WAN 2.5.
    if model == "wan-2.2-spicy" or model == spicy_path:
        model_path = spicy_path
    elif model == "seedance-1.5-pro" or model == seedance_path:
        model_path = seedance_path
    else:
        model_path = wan_25_path
    url = f"{self.base_url}/{model_path}"

    # Fields shared by every model come first, then the model-specific ones.
    request_body = {
        "video": video,
        "prompt": prompt,
        "resolution": resolution,
        "duration": duration,
    }
    if model_path == wan_25_path:
        request_body["enable_prompt_expansion"] = enable_prompt_expansion
        if audio:
            request_body["audio"] = audio
        if negative_prompt:
            request_body["negative_prompt"] = negative_prompt
    elif model_path == seedance_path:
        request_body["generate_audio"] = generate_audio
        request_body["camera_fixed"] = camera_fixed
    if seed is not None:
        request_body["seed"] = seed

    logger.info(f"[WaveSpeed] Extending video via {url} (duration={duration}s, resolution={resolution})")

    # Submit the task.
    submission = requests.post(url, headers=self._get_headers(), json=request_body, timeout=timeout)
    if submission.status_code != 200:
        logger.error(f"[WaveSpeed] Video extend submission failed: {submission.status_code} {submission.text}")
        failure_detail = {
            "error": "WaveSpeed video extend submission failed",
            "status_code": submission.status_code,
            "response": submission.text,
        }
        raise HTTPException(status_code=502, detail=failure_detail)

    # The prediction may be nested under "data" or sit at the top level.
    parsed = submission.json()
    job = parsed.get("data") or parsed
    prediction_id = job.get("id")
    if not prediction_id:
        logger.error(f"[WaveSpeed] No prediction ID in video extend response: {submission.text}")
        raise HTTPException(
            status_code=502,
            detail="WaveSpeed video extend response missing prediction id",
        )
    logger.info(f"[WaveSpeed] Video extend task submitted: {prediction_id}")

    # Wait for the task to finish.
    result = self.polling.poll_until_complete(
        prediction_id,
        timeout_seconds=timeout,
        interval_seconds=2.0,
        progress_callback=progress_callback,
    )
    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed video extend returned no outputs")

    # The first output is either a URL string or an object holding one.
    first_output = outputs[0]
    if isinstance(first_output, str):
        video_url = first_output
    elif isinstance(first_output, dict):
        video_url = first_output.get("url") or first_output.get("video_url")
    else:
        video_url = None
    if not video_url:
        raise HTTPException(status_code=502, detail="WaveSpeed video extend output format not recognized")

    # Fetch the finished video.
    logger.info(f"[WaveSpeed] Downloading extended video from: {video_url}")
    download = requests.get(video_url, timeout=timeout)
    if download.status_code != 200:
        logger.error(f"[WaveSpeed] Failed to download extended video: {download.status_code}")
        raise HTTPException(
            status_code=502,
            detail="Failed to download extended video from WaveSpeed",
        )
    video_bytes = download.content
    logger.info(f"[WaveSpeed] Video extension completed successfully (size: {len(video_bytes)} bytes)")
    return video_bytes
def face_swap(
    self,
    image: str,  # Base64-encoded image or URL
    video: str,  # Base64-encoded video or URL
    prompt: Optional[str] = None,
    resolution: str = "480p",
    seed: Optional[int] = None,
    enable_sync_mode: bool = False,
    timeout: int = 300,
    progress_callback: Optional[Callable[[float, str], None]] = None,
) -> bytes:
    """Swap a face/character into a video using MoCha (wavespeed-ai/wan-2.1/mocha).

    Args:
        image: Base64-encoded image data URI or public URL (reference character).
        video: Base64-encoded video data URI or public URL (source video).
        prompt: Optional prompt to guide the swap; omitted when empty.
        resolution: Output resolution ("480p" or "720p"); any other value
            is replaced by "480p".
        seed: Random seed for reproducibility; None is sent as -1 (random).
        enable_sync_mode: If True, poll until the result is ready and return
            it. If False, the task is still submitted and a 501 carrying the
            prediction ID is raised.
        timeout: Timeout in seconds (default: 300) for the submission
            request, the polling loop and the download.
        progress_callback: Optional callback function(progress: float,
            message: str) forwarded to the poller.

    Returns:
        bytes: Face-swapped video bytes.

    Raises:
        HTTPException: 502 if submission, result extraction or download
            fails; 501 when ``enable_sync_mode`` is False.
    """
    url = f"{self.base_url}/wavespeed-ai/wan-2.1/mocha"

    # Assemble the request body, normalising resolution and seed.
    request_body = {"image": image, "video": video}
    if prompt:
        request_body["prompt"] = prompt
    request_body["resolution"] = resolution if resolution in ("480p", "720p") else "480p"
    request_body["seed"] = -1 if seed is None else seed

    logger.info(
        f"[WaveSpeed] Face swap request via {url} "
        f"(resolution={request_body['resolution']}, seed={request_body['seed']})"
    )

    # Submit the task.
    submission = requests.post(url, headers=self._get_headers(), json=request_body, timeout=timeout)
    if submission.status_code != 200:
        logger.error(f"[WaveSpeed] Face swap submission failed: {submission.status_code} {submission.text}")
        failure_detail = {
            "error": "WaveSpeed face swap submission failed",
            "status_code": submission.status_code,
            "response": submission.text,
        }
        raise HTTPException(status_code=502, detail=failure_detail)

    # The prediction may be nested under "data" or sit at the top level.
    parsed = submission.json()
    job = parsed.get("data") or parsed
    if not job or "id" not in job:
        logger.error(f"[WaveSpeed] Unexpected face swap response: {submission.text}")
        raise HTTPException(
            status_code=502,
            detail={"error": "WaveSpeed response missing prediction id"},
        )
    prediction_id = job["id"]
    logger.info(f"[WaveSpeed] Face swap submitted: {prediction_id}")

    if not enable_sync_mode:
        # No async flow here: surface the prediction ID in the error detail.
        raise HTTPException(
            status_code=501,
            detail={
                "error": "Async mode not yet implemented for face swap",
                "prediction_id": prediction_id,
            },
        )

    # Wait for the task to finish.
    result = self.polling.poll_until_complete(
        prediction_id,
        timeout_seconds=timeout,
        interval_seconds=2.0,
        progress_callback=progress_callback,
    )
    outputs = result.get("outputs", [])
    if not outputs:
        raise HTTPException(
            status_code=502,
            detail={"error": "Face swap completed but no output video found"},
        )

    # The first output is either a URL string or an object holding one.
    first_output = outputs[0]
    if isinstance(first_output, str):
        video_url = first_output
    elif isinstance(first_output, dict):
        video_url = first_output.get("url") or first_output.get("video_url")
    else:
        video_url = None
    if not video_url:
        raise HTTPException(
            status_code=502,
            detail={"error": "Face swap output format not recognized"},
        )

    # Fetch the finished video.
    logger.info(f"[WaveSpeed] Downloading face-swapped video from: {video_url}")
    download = requests.get(video_url, timeout=timeout)
    if download.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={"error": f"Failed to download face-swapped video: {download.status_code}"},
        )
    video_bytes = download.content
    logger.info(f"[WaveSpeed] Face swap completed: {len(video_bytes)} bytes")
    return video_bytes
def video_face_swap(
    self,
    video: str,  # Base64-encoded video or URL
    face_image: str,  # Base64-encoded image or URL
    target_gender: str = "all",
    target_index: int = 0,
    enable_sync_mode: bool = False,
    timeout: int = 300,
    progress_callback: Optional[Callable[[float, str], None]] = None,
) -> bytes:
    """Swap a face into a video using Video Face Swap (wavespeed-ai/video-face-swap).

    Args:
        video: Base64-encoded video data URI or public URL (source video).
        face_image: Base64-encoded image data URI or public URL (reference face).
        target_gender: Filter which faces to swap ("all", "female", "male");
            any other value is replaced by "all".
        target_index: Select which face to swap (0 = largest, 1 = second
            largest, etc.); values outside 0..10 are replaced by 0.
        enable_sync_mode: If True, poll until the result is ready and return
            it. If False, the task is still submitted and a 501 carrying the
            prediction ID is raised.
        timeout: Timeout in seconds (default: 300) for the submission
            request, the polling loop and the download.
        progress_callback: Optional callback function(progress: float,
            message: str) forwarded to the poller.

    Returns:
        bytes: Face-swapped video bytes.

    Raises:
        HTTPException: 502 if submission, result extraction or download
            fails; 501 when ``enable_sync_mode`` is False.
    """
    url = f"{self.base_url}/wavespeed-ai/video-face-swap"

    # Out-of-range selectors are replaced by their defaults, not rejected.
    gender_filter = target_gender if target_gender in ("all", "female", "male") else "all"
    face_index = target_index if 0 <= target_index <= 10 else 0
    request_body = {
        "video": video,
        "face_image": face_image,
        "target_gender": gender_filter,
        "target_index": face_index,
    }

    logger.info(
        f"[WaveSpeed] Video face swap request via {url} "
        f"(target_gender={request_body['target_gender']}, target_index={request_body['target_index']})"
    )

    # Submit the task.
    submission = requests.post(url, headers=self._get_headers(), json=request_body, timeout=timeout)
    if submission.status_code != 200:
        logger.error(f"[WaveSpeed] Video face swap submission failed: {submission.status_code} {submission.text}")
        failure_detail = {
            "error": "WaveSpeed video face swap submission failed",
            "status_code": submission.status_code,
            "response": submission.text,
        }
        raise HTTPException(status_code=502, detail=failure_detail)

    # The prediction may be nested under "data" or sit at the top level.
    parsed = submission.json()
    job = parsed.get("data") or parsed
    if not job or "id" not in job:
        logger.error(f"[WaveSpeed] Unexpected video face swap response: {submission.text}")
        raise HTTPException(
            status_code=502,
            detail={"error": "WaveSpeed response missing prediction id"},
        )
    prediction_id = job["id"]
    logger.info(f"[WaveSpeed] Video face swap submitted: {prediction_id}")

    if not enable_sync_mode:
        # No async flow here: surface the prediction ID in the error detail.
        raise HTTPException(
            status_code=501,
            detail={
                "error": "Async mode not yet implemented for video face swap",
                "prediction_id": prediction_id,
            },
        )

    # Wait for the task to finish.
    result = self.polling.poll_until_complete(
        prediction_id,
        timeout_seconds=timeout,
        interval_seconds=2.0,
        progress_callback=progress_callback,
    )
    outputs = result.get("outputs", [])
    if not outputs:
        raise HTTPException(
            status_code=502,
            detail={"error": "Video face swap completed but no output video found"},
        )

    # The first output is either a URL string or an object holding one.
    first_output = outputs[0]
    if isinstance(first_output, str):
        video_url = first_output
    elif isinstance(first_output, dict):
        video_url = first_output.get("url") or first_output.get("video_url")
    else:
        video_url = None
    if not video_url:
        raise HTTPException(
            status_code=502,
            detail={"error": "Video face swap output format not recognized"},
        )

    # Fetch the finished video.
    logger.info(f"[WaveSpeed] Downloading face-swapped video from: {video_url}")
    download = requests.get(video_url, timeout=timeout)
    if download.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={"error": f"Failed to download face-swapped video: {download.status_code}"},
        )
    video_bytes = download.content
    logger.info(f"[WaveSpeed] Video face swap completed: {len(video_bytes)} bytes")
    return video_bytes
def video_translate(
    self,
    video: str,  # Base64-encoded video or URL
    output_language: str = "English",
    enable_sync_mode: bool = False,
    timeout: int = 600,
    progress_callback: Optional[Callable[[float, str], None]] = None,
) -> bytes:
    """Translate a video to a target language using HeyGen Video Translate.

    Args:
        video: Base64-encoded video data URI or public URL (source video).
        output_language: Target language for translation (default: "English").
        enable_sync_mode: If True, poll until the result is ready and return
            it. If False, the task is still submitted and a 501 carrying the
            prediction ID is raised.
        timeout: Timeout in seconds (default: 600) for the submission
            request, the polling loop and the download.
        progress_callback: Optional callback function(progress: float,
            message: str) forwarded to the poller.

    Returns:
        bytes: Translated video bytes.

    Raises:
        HTTPException: 502 if submission, result extraction or download
            fails; 501 when ``enable_sync_mode`` is False.
    """
    url = f"{self.base_url}/heygen/video-translate"
    request_body = {"video": video, "output_language": output_language}

    logger.info(
        f"[WaveSpeed] Video translate request via {url} "
        f"(output_language={output_language})"
    )

    # Submit the task.
    submission = requests.post(url, headers=self._get_headers(), json=request_body, timeout=timeout)
    if submission.status_code != 200:
        logger.error(f"[WaveSpeed] Video translate submission failed: {submission.status_code} {submission.text}")
        failure_detail = {
            "error": "WaveSpeed video translate submission failed",
            "status_code": submission.status_code,
            "response": submission.text,
        }
        raise HTTPException(status_code=502, detail=failure_detail)

    # The prediction may be nested under "data" or sit at the top level.
    parsed = submission.json()
    job = parsed.get("data") or parsed
    if not job or "id" not in job:
        logger.error(f"[WaveSpeed] Unexpected video translate response: {submission.text}")
        raise HTTPException(
            status_code=502,
            detail={"error": "WaveSpeed response missing prediction id"},
        )
    prediction_id = job["id"]
    logger.info(f"[WaveSpeed] Video translate submitted: {prediction_id}")

    if not enable_sync_mode:
        # No async flow here: surface the prediction ID in the error detail.
        raise HTTPException(
            status_code=501,
            detail={
                "error": "Async mode not yet implemented for video translate",
                "prediction_id": prediction_id,
            },
        )

    # Wait for the task to finish.
    result = self.polling.poll_until_complete(
        prediction_id,
        timeout_seconds=timeout,
        interval_seconds=2.0,
        progress_callback=progress_callback,
    )
    outputs = result.get("outputs", [])
    if not outputs:
        raise HTTPException(
            status_code=502,
            detail={"error": "Video translate completed but no output video found"},
        )

    # The first output is either a URL string or an object holding one.
    first_output = outputs[0]
    if isinstance(first_output, str):
        video_url = first_output
    elif isinstance(first_output, dict):
        video_url = first_output.get("url") or first_output.get("video_url")
    else:
        video_url = None
    if not video_url:
        raise HTTPException(
            status_code=502,
            detail={"error": "Video translate output format not recognized"},
        )

    # Fetch the finished video.
    logger.info(f"[WaveSpeed] Downloading translated video from: {video_url}")
    download = requests.get(video_url, timeout=timeout)
    if download.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={"error": f"Failed to download translated video: {download.status_code}"},
        )
    video_bytes = download.content
    logger.info(f"[WaveSpeed] Video translate completed: {len(video_bytes)} bytes")
    return video_bytes
def remove_background(
    self,
    video: str,  # Base64-encoded video or URL
    background_image: Optional[str] = None,  # Base64-encoded image or URL (optional)
    enable_sync_mode: bool = False,
    timeout: int = 300,
    progress_callback: Optional[Callable[[float, str], None]] = None,
) -> bytes:
    """Remove or replace a video background using Video Background Remover.

    Args:
        video: Base64-encoded video data URI or public URL (source video).
        background_image: Optional base64-encoded image data URI or public
            URL (replacement background); only sent when non-empty.
        enable_sync_mode: If True, poll until the result is ready and return
            it. If False, the task is still submitted and a 501 carrying the
            prediction ID is raised.
        timeout: Timeout in seconds (default: 300) for the submission
            request, the polling loop and the download.
        progress_callback: Optional callback function(progress: float,
            message: str) forwarded to the poller.

    Returns:
        bytes: Video with background removed/replaced.

    Raises:
        HTTPException: 502 if submission, result extraction or download
            fails; 501 when ``enable_sync_mode`` is False.
    """
    model_path = "wavespeed-ai/video-background-remover"
    url = f"{self.base_url}/{model_path}"

    # Build payload
    payload = {
        "video": video,
    }
    if background_image:
        payload["background_image"] = background_image

    # Log whether a background is actually sent. The payload check above is
    # a truthiness test, so an empty string must be reported as False here.
    logger.info(
        f"[WaveSpeed] Video background removal request via {url} "
        f"(has_background={bool(background_image)})"
    )

    # Submit the task
    response = requests.post(url, headers=self._get_headers(), json=payload, timeout=timeout)
    if response.status_code != 200:
        logger.error(f"[WaveSpeed] Video background removal submission failed: {response.status_code} {response.text}")
        raise HTTPException(
            status_code=502,
            detail={
                "error": "WaveSpeed video background removal submission failed",
                "status_code": response.status_code,
                "response": response.text,
            },
        )

    # The prediction may be nested under "data" or sit at the top level.
    response_json = response.json()
    data = response_json.get("data") or response_json
    prediction_id = data.get("id")
    if not prediction_id:
        logger.error(f"[WaveSpeed] No prediction ID in video background removal response: {response.text}")
        raise HTTPException(
            status_code=502,
            detail="WaveSpeed video background removal response missing prediction id",
        )
    logger.info(f"[WaveSpeed] Video background removal task submitted: {prediction_id}")

    if not enable_sync_mode:
        # No async flow here: surface the prediction ID in the error detail.
        raise HTTPException(
            status_code=501,
            detail={
                "error": "Async mode not yet implemented for video background removal",
                "prediction_id": prediction_id,
            },
        )

    result = self.polling.poll_until_complete(
        prediction_id,
        timeout_seconds=timeout,
        interval_seconds=2.0,
        progress_callback=progress_callback,
    )
    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed video background removal returned no outputs")

    # The first output is either a URL string or an object holding one.
    video_url = None
    if isinstance(outputs[0], str):
        video_url = outputs[0]
    elif isinstance(outputs[0], dict):
        video_url = outputs[0].get("url") or outputs[0].get("video_url")
    if not video_url:
        raise HTTPException(status_code=502, detail="WaveSpeed video background removal output format not recognized")

    logger.info(f"[WaveSpeed] Downloading processed video from: {video_url}")
    video_response = requests.get(video_url, timeout=timeout)
    if video_response.status_code != 200:
        logger.error(f"[WaveSpeed] Failed to download processed video: {video_response.status_code}")
        raise HTTPException(
            status_code=502,
            detail="Failed to download processed video from WaveSpeed",
        )
    video_bytes = video_response.content
    logger.info(f"[WaveSpeed] Video background removal completed successfully (size: {len(video_bytes)} bytes)")
    return video_bytes
def hunyuan_video_foley(
    self,
    video: str,  # Base64-encoded video or URL
    prompt: Optional[str] = None,  # Optional text prompt describing desired sounds
    seed: int = -1,  # Random seed (-1 for random)
    enable_sync_mode: bool = False,
    timeout: int = 300,
    progress_callback: Optional[Callable[[float, str], None]] = None,
) -> bytes:
    """
    Generate realistic Foley and ambient audio from video using Hunyuan Video Foley.

    The task is submitted to the ``wavespeed-ai/hunyuan-video-foley`` endpoint.
    With ``enable_sync_mode=True`` the prediction is polled until completion and
    the first output is downloaded and returned as raw bytes.

    Args:
        video: Base64-encoded video data URI or public URL (source video)
        prompt: Optional text prompt describing desired sounds (e.g., "ocean waves, seagulls").
            Empty or None prompts are not sent to the API.
        seed: Random seed for reproducibility (-1 for random)
        enable_sync_mode: If True, wait for result and return it directly.
            If False (the default), the task is still submitted, after which a
            501 HTTPException carrying the ``prediction_id`` is raised.
        timeout: Timeout in seconds (default: 300). The same value is applied to
            the submission request, the polling loop and the result download.
        progress_callback: Optional callback function(progress: float, message: str) for progress updates

    Returns:
        bytes: Video with generated audio

    Raises:
        HTTPException: 502 if submission fails, the response is not valid JSON,
            the prediction id is missing, no usable output is returned, or the
            download fails; 501 if ``enable_sync_mode`` is False.
    """
    model_path = "wavespeed-ai/hunyuan-video-foley"
    url = f"{self.base_url}/{model_path}"

    # Build payload; the prompt is only included when it is non-empty.
    payload: Dict[str, Any] = {
        "video": video,
        "seed": seed,
    }
    if prompt:
        payload["prompt"] = prompt

    # Report has_prompt with the same truthiness test used for the payload,
    # so an empty-string prompt is not logged as present.
    logger.info(
        f"[WaveSpeed] Hunyuan Video Foley request via {url} "
        f"(has_prompt={bool(prompt)}, seed={seed})"
    )

    # Submit the task
    response = requests.post(url, headers=self._get_headers(), json=payload, timeout=timeout)
    if response.status_code != 200:
        logger.error(f"[WaveSpeed] Hunyuan Video Foley submission failed: {response.status_code} {response.text}")
        raise HTTPException(
            status_code=502,
            detail={
                "error": "WaveSpeed Hunyuan Video Foley submission failed",
                "status_code": response.status_code,
                "response": response.text,
            },
        )

    # A 200 response with a non-JSON body must surface as a 502, not as an
    # unhandled ValueError from the JSON decoder.
    try:
        response_json = response.json()
    except ValueError as exc:
        logger.error(f"[WaveSpeed] Hunyuan Video Foley returned invalid JSON: {response.text}")
        raise HTTPException(
            status_code=502,
            detail="WaveSpeed Hunyuan Video Foley returned an invalid JSON response",
        ) from exc

    # The id may live under "data" or at the top level; tolerate non-object
    # JSON (list, string, ...) by treating it as a missing id.
    data = None
    if isinstance(response_json, dict):
        data = response_json.get("data") or response_json
    prediction_id = data.get("id") if isinstance(data, dict) else None
    if not prediction_id:
        logger.error(f"[WaveSpeed] No prediction ID in Hunyuan Video Foley response: {response.text}")
        raise HTTPException(
            status_code=502,
            detail="WaveSpeed Hunyuan Video Foley response missing prediction id",
        )
    logger.info(f"[WaveSpeed] Hunyuan Video Foley task submitted: {prediction_id}")

    if not enable_sync_mode:
        # NOTE: the task has already been submitted at this point; the
        # prediction id is returned in the error detail.
        raise HTTPException(
            status_code=501,
            detail={
                "error": "Async mode not yet implemented for Hunyuan Video Foley",
                "prediction_id": prediction_id,
            },
        )

    result = self.polling.poll_until_complete(
        prediction_id,
        timeout_seconds=timeout,
        interval_seconds=2.0,
        progress_callback=progress_callback,
    )
    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed Hunyuan Video Foley returned no outputs")

    # The first output is either a plain URL string or a dict holding the URL.
    first_output = outputs[0]
    video_url = None
    if isinstance(first_output, str):
        video_url = first_output
    elif isinstance(first_output, dict):
        video_url = first_output.get("url") or first_output.get("video_url")
    if not video_url:
        raise HTTPException(status_code=502, detail="WaveSpeed Hunyuan Video Foley output format not recognized")

    logger.info(f"[WaveSpeed] Downloading video with audio from: {video_url}")
    video_response = requests.get(video_url, timeout=timeout)
    if video_response.status_code != 200:
        logger.error(f"[WaveSpeed] Failed to download video with audio: {video_response.status_code}")
        raise HTTPException(
            status_code=502,
            detail="Failed to download video with audio from WaveSpeed",
        )
    video_bytes = video_response.content
    logger.info(f"[WaveSpeed] Hunyuan Video Foley completed successfully (size: {len(video_bytes)} bytes)")
    return video_bytes
def think_sound(
    self,
    video: str,  # Base64-encoded video or URL
    prompt: Optional[str] = None,  # Optional text prompt describing desired sounds
    seed: int = -1,  # Random seed (-1 for random)
    enable_sync_mode: bool = False,
    timeout: int = 300,
    progress_callback: Optional[Callable[[float, str], None]] = None,
) -> bytes:
    """
    Generate realistic sound effects and audio tracks from video using Think Sound.

    The task is submitted to the ``wavespeed-ai/think-sound`` endpoint. With
    ``enable_sync_mode=True`` the prediction is polled until completion and the
    first output is downloaded and returned as raw bytes.

    Args:
        video: Base64-encoded video data URI or public URL (source video)
        prompt: Optional text prompt describing desired sounds (e.g., "engine roaring, footsteps on gravel").
            Empty or None prompts are not sent to the API.
        seed: Random seed for reproducibility (-1 for random)
        enable_sync_mode: If True, wait for result and return it directly.
            If False (the default), the task is still submitted, after which a
            501 HTTPException carrying the ``prediction_id`` is raised.
        timeout: Timeout in seconds (default: 300). The same value is applied to
            the submission request, the polling loop and the result download.
        progress_callback: Optional callback function(progress: float, message: str) for progress updates

    Returns:
        bytes: Video with generated audio

    Raises:
        HTTPException: 502 if submission fails, the response is not valid JSON,
            the prediction id is missing, no usable output is returned, or the
            download fails; 501 if ``enable_sync_mode`` is False.
    """
    model_path = "wavespeed-ai/think-sound"
    url = f"{self.base_url}/{model_path}"

    # Build payload; the prompt is only included when it is non-empty.
    payload: Dict[str, Any] = {
        "video": video,
        "seed": seed,
    }
    if prompt:
        payload["prompt"] = prompt

    # Report has_prompt with the same truthiness test used for the payload,
    # so an empty-string prompt is not logged as present.
    logger.info(
        f"[WaveSpeed] Think Sound request via {url} "
        f"(has_prompt={bool(prompt)}, seed={seed})"
    )

    # Submit the task
    response = requests.post(url, headers=self._get_headers(), json=payload, timeout=timeout)
    if response.status_code != 200:
        logger.error(f"[WaveSpeed] Think Sound submission failed: {response.status_code} {response.text}")
        raise HTTPException(
            status_code=502,
            detail={
                "error": "WaveSpeed Think Sound submission failed",
                "status_code": response.status_code,
                "response": response.text,
            },
        )

    # A 200 response with a non-JSON body must surface as a 502, not as an
    # unhandled ValueError from the JSON decoder.
    try:
        response_json = response.json()
    except ValueError as exc:
        logger.error(f"[WaveSpeed] Think Sound returned invalid JSON: {response.text}")
        raise HTTPException(
            status_code=502,
            detail="WaveSpeed Think Sound returned an invalid JSON response",
        ) from exc

    # The id may live under "data" or at the top level; tolerate non-object
    # JSON (list, string, ...) by treating it as a missing id.
    data = None
    if isinstance(response_json, dict):
        data = response_json.get("data") or response_json
    prediction_id = data.get("id") if isinstance(data, dict) else None
    if not prediction_id:
        logger.error(f"[WaveSpeed] No prediction ID in Think Sound response: {response.text}")
        raise HTTPException(
            status_code=502,
            detail="WaveSpeed Think Sound response missing prediction id",
        )
    logger.info(f"[WaveSpeed] Think Sound task submitted: {prediction_id}")

    if not enable_sync_mode:
        # NOTE: the task has already been submitted at this point; the
        # prediction id is returned in the error detail.
        raise HTTPException(
            status_code=501,
            detail={
                "error": "Async mode not yet implemented for Think Sound",
                "prediction_id": prediction_id,
            },
        )

    result = self.polling.poll_until_complete(
        prediction_id,
        timeout_seconds=timeout,
        interval_seconds=2.0,
        progress_callback=progress_callback,
    )
    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed Think Sound returned no outputs")

    # The first output is either a plain URL string or a dict holding the URL.
    first_output = outputs[0]
    video_url = None
    if isinstance(first_output, str):
        video_url = first_output
    elif isinstance(first_output, dict):
        video_url = first_output.get("url") or first_output.get("video_url")
    if not video_url:
        raise HTTPException(status_code=502, detail="WaveSpeed Think Sound output format not recognized")

    logger.info(f"[WaveSpeed] Downloading video with audio from: {video_url}")
    video_response = requests.get(video_url, timeout=timeout)
    if video_response.status_code != 200:
        logger.error(f"[WaveSpeed] Failed to download video with audio: {video_response.status_code}")
        raise HTTPException(
            status_code=502,
            detail="Failed to download video with audio from WaveSpeed",
        )
    video_bytes = video_response.content
    logger.info(f"[WaveSpeed] Think Sound completed successfully (size: {len(video_bytes)} bytes)")
    return video_bytes