245 lines
9.5 KiB
Python
245 lines
9.5 KiB
Python
"""
Video audio generation operations.
"""
|
|
|
|
import requests
|
|
from typing import Optional, Callable
|
|
from fastapi import HTTPException
|
|
|
|
from utils.logger_utils import get_service_logger
|
|
from .base import VideoBase
|
|
|
|
logger = get_service_logger("wavespeed.generators.video.audio")
|
|
|
|
|
|
class VideoAudio(VideoBase):
    """Video audio generation operations.

    Both public methods submit a video to a WaveSpeed model that adds a
    generated audio track, then (in sync mode) poll for completion and
    download the resulting file. They share one implementation,
    ``_generate_video_audio``, and differ only in the model path and the
    display name used in log and error messages.

    NOTE(review): ``self.base_url``, ``self._get_headers()`` and
    ``self.polling`` are not defined in this class; presumably they are
    provided by ``VideoBase`` -- confirm.
    """

    def hunyuan_video_foley(
        self,
        video: str,
        prompt: Optional[str] = None,
        seed: int = -1,
        enable_sync_mode: bool = False,
        timeout: int = 300,
        progress_callback: Optional[Callable[[float, str], None]] = None,
    ) -> bytes:
        """
        Generate realistic Foley and ambient audio from video using Hunyuan Video Foley.

        Args:
            video: Base64-encoded video data URI or public URL (source video)
            prompt: Optional text prompt describing desired sounds (e.g., "ocean waves, seagulls")
            seed: Random seed for reproducibility (-1 for random)
            enable_sync_mode: If True, wait for result and return it directly.
                If False, the task is still submitted and a 501 HTTPException
                carrying the prediction id is raised.
            timeout: Timeout in seconds (default: 300). Applied separately to
                the submit request, the polling wait and the download.
            progress_callback: Optional callback function(progress: float, message: str) for progress updates

        Returns:
            bytes: Video with generated audio

        Raises:
            HTTPException: If the audio generation fails
        """
        return self._generate_video_audio(
            model_path="wavespeed-ai/hunyuan-video-foley",
            label="Hunyuan Video Foley",
            video=video,
            prompt=prompt,
            seed=seed,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
            progress_callback=progress_callback,
        )

    def think_sound(
        self,
        video: str,
        prompt: Optional[str] = None,
        seed: int = -1,
        enable_sync_mode: bool = False,
        timeout: int = 300,
        progress_callback: Optional[Callable[[float, str], None]] = None,
    ) -> bytes:
        """
        Generate realistic sound effects and audio tracks from video using Think Sound.

        Args:
            video: Base64-encoded video data URI or public URL (source video)
            prompt: Optional text prompt describing desired sounds (e.g., "engine roaring, footsteps on gravel")
            seed: Random seed for reproducibility (-1 for random)
            enable_sync_mode: If True, wait for result and return it directly.
                If False, the task is still submitted and a 501 HTTPException
                carrying the prediction id is raised.
            timeout: Timeout in seconds (default: 300). Applied separately to
                the submit request, the polling wait and the download.
            progress_callback: Optional callback function(progress: float, message: str) for progress updates

        Returns:
            bytes: Video with generated audio

        Raises:
            HTTPException: If the audio generation fails
        """
        return self._generate_video_audio(
            model_path="wavespeed-ai/think-sound",
            label="Think Sound",
            video=video,
            prompt=prompt,
            seed=seed,
            enable_sync_mode=enable_sync_mode,
            timeout=timeout,
            progress_callback=progress_callback,
        )

    def _generate_video_audio(
        self,
        model_path: str,
        label: str,
        video: str,
        prompt: Optional[str],
        seed: int,
        enable_sync_mode: bool,
        timeout: int,
        progress_callback: Optional[Callable[[float, str], None]],
    ) -> bytes:
        """Submit a video-to-audio task, wait for it and download the result.

        Args:
            model_path: Model path appended to ``self.base_url``.
            label: Human-readable model name used in log and error messages.
            video: Base64-encoded video data URI or public URL.
            prompt: Optional sound description; omitted from the payload when empty.
            seed: Random seed sent to the model.
            enable_sync_mode: When False, a 501 is raised after submission.
            timeout: Seconds, used for the submit request, the polling wait
                and the download individually.
            progress_callback: Forwarded to the poller.

        Returns:
            bytes: Body of the downloaded output file.

        Raises:
            HTTPException: 502 for any unusable upstream response, 501 when
                ``enable_sync_mode`` is False. Exceptions raised by
                ``requests`` itself (connection errors, timeouts) are not
                caught here and propagate unchanged.
        """
        url = f"{self.base_url}/{model_path}"

        payload = {"video": video, "seed": seed}
        if prompt:
            payload["prompt"] = prompt

        # Report whether a prompt is actually sent: an empty string is
        # dropped from the payload above, so it must not be logged as present.
        logger.info(
            f"[WaveSpeed] {label} request via {url} "
            f"(has_prompt={bool(prompt)}, seed={seed})"
        )

        # Submit the task
        response = requests.post(url, headers=self._get_headers(), json=payload, timeout=timeout)

        if response.status_code != 200:
            logger.error(f"[WaveSpeed] {label} submission failed: {response.status_code} {response.text}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": f"WaveSpeed {label} submission failed",
                    "status_code": response.status_code,
                    "response": response.text,
                },
            )

        prediction_id = self._extract_prediction_id(response, label)
        if not prediction_id:
            logger.error(f"[WaveSpeed] No prediction ID in {label} response: {response.text}")
            raise HTTPException(
                status_code=502,
                detail=f"WaveSpeed {label} response missing prediction id",
            )

        logger.info(f"[WaveSpeed] {label} task submitted: {prediction_id}")

        # The task has already been submitted at this point; the id is
        # returned in the error detail so the caller can still reference it.
        if not enable_sync_mode:
            raise HTTPException(
                status_code=501,
                detail={
                    "error": f"Async mode not yet implemented for {label}",
                    "prediction_id": prediction_id,
                },
            )

        result = self.polling.poll_until_complete(
            prediction_id,
            timeout_seconds=timeout,
            interval_seconds=2.0,
            progress_callback=progress_callback,
        )

        outputs = result.get("outputs") or []
        if not outputs:
            raise HTTPException(status_code=502, detail=f"WaveSpeed {label} returned no outputs")

        # Only the first output is used; it may be a bare URL string or an
        # object carrying the URL under "url" or "video_url".
        first_output = outputs[0]
        video_url = None
        if isinstance(first_output, str):
            video_url = first_output
        elif isinstance(first_output, dict):
            video_url = first_output.get("url") or first_output.get("video_url")

        if not video_url:
            raise HTTPException(status_code=502, detail=f"WaveSpeed {label} output format not recognized")

        logger.info(f"[WaveSpeed] Downloading video with audio from: {video_url}")
        video_response = requests.get(video_url, timeout=timeout)

        if video_response.status_code != 200:
            logger.error(f"[WaveSpeed] Failed to download video with audio: {video_response.status_code}")
            raise HTTPException(
                status_code=502,
                detail="Failed to download video with audio from WaveSpeed",
            )

        video_bytes = video_response.content
        logger.info(f"[WaveSpeed] {label} completed successfully (size: {len(video_bytes)} bytes)")

        return video_bytes

    @staticmethod
    def _extract_prediction_id(response, label: str):
        """Return the prediction id from a submission response, or None.

        The id is read from the ``data`` object when present and truthy,
        otherwise from the top-level JSON object. A body or ``data`` field
        that is not a JSON object yields None instead of raising.

        Raises:
            HTTPException: 502 when the body cannot be decoded as JSON.
        """
        try:
            response_json = response.json()
        except ValueError as exc:
            # requests raises a ValueError subclass for undecodable bodies;
            # surface it like every other upstream failure.
            logger.error(f"[WaveSpeed] {label} returned a non-JSON response: {response.text}")
            raise HTTPException(
                status_code=502,
                detail=f"WaveSpeed {label} returned a non-JSON response",
            ) from exc

        data = response_json
        if isinstance(response_json, dict):
            data = response_json.get("data") or response_json

        if not isinstance(data, dict):
            return None
        return data.get("id")
|