- Add ALWRITY_ENABLED_FEATURES env var for feature gating - Podcast-only mode: skip LLM bootstrap, scheduler, persona services - Enhance video generation prompt with scene context, analysis, narration - Add voice cloning support via custom_voice_id in WaveSpeed - Add text-to-speech for research results (browser speechSynthesis) - Fix render queue to sync images from script phase - Add WaveSpeed LLM pricing (gpt-oss-120b) - Fix podcast bible generation error handling - Refactor RouterManager for feature-based router loading
231 lines
8.0 KiB
Python
231 lines
8.0 KiB
Python
from __future__ import annotations
|
|
|
|
import base64
|
|
from typing import Any, Dict, Optional
|
|
|
|
import requests
|
|
from fastapi import HTTPException
|
|
from loguru import logger
|
|
|
|
from .client import WaveSpeedClient
|
|
|
|
INFINITALK_MODEL_PATH = "wavespeed-ai/infinitetalk"
|
|
INFINITALK_MODEL_NAME = "wavespeed-ai/infinitetalk"
|
|
INFINITALK_DEFAULT_COST = 0.30 # $0.30 per 5 seconds at 720p tier
|
|
MAX_IMAGE_BYTES = 10 * 1024 * 1024 # 10MB
|
|
MAX_AUDIO_BYTES = 50 * 1024 * 1024 # 50MB safety cap
|
|
|
|
|
|
def _as_data_uri(content_bytes: bytes, mime_type: str) -> str:
|
|
encoded = base64.b64encode(content_bytes).decode("utf-8")
|
|
return f"data:{mime_type};base64,{encoded}"
|
|
|
|
|
|
def _generate_simple_infinitetalk_prompt(
|
|
scene_data: Dict[str, Any],
|
|
story_context: Dict[str, Any],
|
|
) -> Optional[str]:
|
|
"""
|
|
Generate an enhanced prompt for InfiniteTalk video generation.
|
|
Includes scene content, analysis, bible context, and visual elements.
|
|
|
|
Returns None if no meaningful prompt can be generated.
|
|
"""
|
|
title = (scene_data.get("title") or "").strip()
|
|
description = (scene_data.get("description") or "").strip()
|
|
image_prompt = (scene_data.get("image_prompt") or "").strip()
|
|
lines = scene_data.get("lines", [])
|
|
narration = ""
|
|
if lines:
|
|
# Combine first few lines for context
|
|
narration = " ".join([str(l.get("text", "")) for l in lines[:3]])[:150]
|
|
|
|
# Build enhanced prompt with multiple context sources
|
|
parts = []
|
|
|
|
# Add main scene title
|
|
if title and len(title) > 5 and title.lower() not in ("scene", "podcast", "episode"):
|
|
parts.append(title)
|
|
|
|
# Add analysis context
|
|
analysis = story_context.get("analysis", {})
|
|
if analysis:
|
|
content_type = analysis.get("content_type")
|
|
if content_type:
|
|
parts.append(f"Content type: {content_type}")
|
|
|
|
# Add key takeaways if available
|
|
key_takeaways = analysis.get("keyTakeaways", [])
|
|
if key_takeaways and isinstance(key_takeaways, list) and len(key_takeaways) > 0:
|
|
takeaway = str(key_takeaways[0])[:80]
|
|
if takeaway:
|
|
parts.append(f"Key insight: {takeaway}")
|
|
|
|
# Audience
|
|
audience = analysis.get("audience")
|
|
if audience:
|
|
short_audience = " ".join(audience.split()[:3])
|
|
parts.append(f"Target audience: {short_audience}")
|
|
|
|
# Guest info
|
|
guest_name = analysis.get("guestName")
|
|
guest_expertise = analysis.get("guestExpertise")
|
|
if guest_name:
|
|
parts.append(f"Guest: {guest_name}")
|
|
if guest_expertise:
|
|
parts.append(f"Expertise: {guest_expertise}")
|
|
|
|
# Add bible context
|
|
bible = story_context.get("bible", {})
|
|
if bible:
|
|
host_persona = bible.get("host_persona")
|
|
tone = bible.get("tone")
|
|
visual_style = bible.get("visual_style")
|
|
background = bible.get("background")
|
|
|
|
if host_persona:
|
|
parts.append(f"Host persona: {host_persona}")
|
|
if tone:
|
|
parts.append(f"Tone: {tone}")
|
|
if visual_style:
|
|
parts.append(f"Visual style: {visual_style}")
|
|
if background:
|
|
parts.append(f"Background: {background}")
|
|
|
|
# Add original image prompt as fallback context
|
|
if image_prompt and len(parts) < 3:
|
|
img_part = image_prompt.split('.')[0][:100].strip()
|
|
if img_part:
|
|
parts.append(f"Visual context: {img_part}")
|
|
|
|
# Add narration snippet if available
|
|
if narration and len(parts) < 4:
|
|
parts.append(f"Discussing: {narration}")
|
|
|
|
if not parts:
|
|
return None
|
|
|
|
# Build prompt with visual quality keywords
|
|
quality_keywords = "Cinematic lighting, high detail, 4k quality, smooth motion"
|
|
|
|
# Combine parts into final prompt
|
|
prompt = f"{'. '.join(parts)}. {quality_keywords}. With subtle natural movement."
|
|
|
|
# Allow more room for detailed prompts - max 350 characters
|
|
prompt = prompt[:350].strip()
|
|
|
|
# Clean up trailing punctuation
|
|
if prompt.endswith(',') or prompt.endswith('.'):
|
|
prompt = prompt[:-1].strip()
|
|
|
|
return prompt if len(prompt) >= 15 else None
|
|
|
|
|
|
def animate_scene_with_voiceover(
|
|
*,
|
|
image_bytes: bytes,
|
|
audio_bytes: bytes,
|
|
scene_data: Dict[str, Any],
|
|
story_context: Dict[str, Any],
|
|
user_id: str,
|
|
resolution: str = "720p",
|
|
prompt_override: Optional[str] = None,
|
|
mask_image_bytes: Optional[bytes] = None,
|
|
seed: Optional[int] = -1,
|
|
image_mime: str = "image/png",
|
|
audio_mime: str = "audio/mpeg",
|
|
client: Optional[WaveSpeedClient] = None,
|
|
) -> Dict[str, Any]:
|
|
"""
|
|
Animate a scene image with narration audio using WaveSpeed InfiniteTalk.
|
|
Returns dict with video bytes, prompt used, model name, and cost.
|
|
"""
|
|
|
|
if not image_bytes:
|
|
raise HTTPException(status_code=404, detail="Scene image bytes missing for animation.")
|
|
if not audio_bytes:
|
|
raise HTTPException(status_code=404, detail="Scene audio bytes missing for animation.")
|
|
|
|
if len(image_bytes) > MAX_IMAGE_BYTES:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail="Scene image exceeds 10MB limit required by WaveSpeed InfiniteTalk.",
|
|
)
|
|
if len(audio_bytes) > MAX_AUDIO_BYTES:
|
|
raise HTTPException(
|
|
status_code=400,
|
|
detail="Scene audio exceeds 50MB limit allowed for InfiniteTalk requests.",
|
|
)
|
|
|
|
if resolution not in {"480p", "720p"}:
|
|
raise HTTPException(status_code=400, detail="Resolution must be '480p' or '720p'.")
|
|
|
|
# Generate simple, concise prompt for InfiniteTalk (audio-driven, less need for elaborate descriptions)
|
|
animation_prompt = prompt_override or _generate_simple_infinitetalk_prompt(scene_data, story_context)
|
|
|
|
payload: Dict[str, Any] = {
|
|
"image": _as_data_uri(image_bytes, image_mime),
|
|
"audio": _as_data_uri(audio_bytes, audio_mime),
|
|
"resolution": resolution,
|
|
}
|
|
# Only include prompt if we have a meaningful one (InfiniteTalk works fine without it)
|
|
if animation_prompt:
|
|
payload["prompt"] = animation_prompt
|
|
if mask_image_bytes:
|
|
payload["mask_image"] = _as_data_uri(mask_image_bytes, image_mime)
|
|
if seed is not None:
|
|
payload["seed"] = seed
|
|
|
|
client = client or WaveSpeedClient()
|
|
prediction_id = client.submit_image_to_video(INFINITALK_MODEL_PATH, payload, timeout=60)
|
|
|
|
try:
|
|
# Poll faster (0.5s) to mirror reference pattern; allow up to 10 minutes
|
|
result = client.poll_until_complete(prediction_id, timeout_seconds=600, interval_seconds=0.5)
|
|
except HTTPException as exc:
|
|
detail = exc.detail or {}
|
|
if isinstance(detail, dict):
|
|
detail.setdefault("prediction_id", prediction_id)
|
|
detail.setdefault("resume_available", True)
|
|
raise
|
|
|
|
outputs = result.get("outputs") or []
|
|
if not outputs:
|
|
raise HTTPException(status_code=502, detail="WaveSpeed InfiniteTalk completed but returned no outputs.")
|
|
|
|
video_url = outputs[0]
|
|
video_response = requests.get(video_url, timeout=180)
|
|
if video_response.status_code != 200:
|
|
raise HTTPException(
|
|
status_code=502,
|
|
detail={
|
|
"error": "Failed to download InfiniteTalk video",
|
|
"status_code": video_response.status_code,
|
|
"response": video_response.text[:200],
|
|
},
|
|
)
|
|
|
|
metadata = result.get("metadata") or {}
|
|
duration = metadata.get("duration_seconds") or metadata.get("duration") or 0
|
|
|
|
logger.info(
|
|
"[InfiniteTalk] Generated talking avatar video user=%s scene=%s resolution=%s size=%s bytes",
|
|
user_id,
|
|
scene_data.get("scene_number"),
|
|
resolution,
|
|
len(video_response.content),
|
|
)
|
|
|
|
return {
|
|
"video_bytes": video_response.content,
|
|
"prompt": animation_prompt,
|
|
"duration": duration or 5,
|
|
"model_name": INFINITALK_MODEL_NAME,
|
|
"cost": INFINITALK_DEFAULT_COST,
|
|
"provider": "wavespeed",
|
|
"source_video_url": video_url,
|
|
"prediction_id": prediction_id,
|
|
}
|
|
|
|
|