feat: voice clone audio generation + podcast workspace architecture
- Voice clone integration: When the user selects voice clone in the Write phase, the backend uses their uploaded voice sample + scene script text to generate audio via the qwen3/minimax/cosyvoice voice clone APIs
- Multi-tenant workspace storage: All podcast assets (audio, video, images, charts) now use workspace-specific directories per user
- Chart preview improvements: Card-based B-Roll charts UI with thumbnails, takeaway text, and action buttons; public endpoint for image serving
- Voice clone caching: In-memory LRU cache for voice samples (avoids re-downloading per scene); frontend caches voice clone metadata
- Thread pool for voice clone: Audio generation uses ThreadPoolExecutor to avoid blocking the FastAPI event loop
- Auto-detect voice clone IDs (vc_*, MY_VOICE_CLONE) to route correctly
- DB fallback for voice sample URL: Fetches from ContentAsset if not passed
- Fixed API URL resolution for chart previews
- Fixed GlassyCard DOM warnings for motion props
- Fixed ScriptGenerationProgressView syntax error
- Fixed usePodcastWorkflow scriptData reference
This commit is contained in:
@@ -12,7 +12,15 @@ from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
import tempfile
|
||||
import uuid
|
||||
import hashlib
|
||||
import time
|
||||
import shutil
|
||||
import requests
|
||||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import asyncio
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
from services.database import get_db
|
||||
from middleware.auth_middleware import get_current_user, get_current_user_with_query_token
|
||||
@@ -31,6 +39,124 @@ from ..models import (
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Thread pool for CPU/IO-intensive voice clone operations
|
||||
_audio_executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="podcast_audio")
|
||||
|
||||
# In-memory cache for voice samples (keyed per user and URL, entries expire after a TTL) to avoid re-downloading
|
||||
_voice_sample_cache: dict[str, tuple[float, bytes]] = {}
|
||||
_VOICE_SAMPLE_CACHE_TTL = 1800 # 30 minutes
|
||||
|
||||
|
||||
def _get_cached_voice_sample(cache_key: str) -> Optional[bytes]:
    """Return the cached voice sample for ``cache_key`` while it is still fresh.

    An entry whose age has reached ``_VOICE_SAMPLE_CACHE_TTL`` seconds is
    removed from the cache and reported as a miss.

    Args:
        cache_key: Key the sample was stored under.

    Returns:
        The cached audio bytes, or ``None`` on a miss or an expired entry.
    """
    entry = _voice_sample_cache.get(cache_key)
    if entry is None:
        return None

    stored_at, payload = entry
    age_seconds = time.time() - stored_at
    if age_seconds >= _VOICE_SAMPLE_CACHE_TTL:
        # Stale entry: drop it so the caller re-fetches and re-caches.
        del _voice_sample_cache[cache_key]
        return None

    logger.debug(f"[Podcast] Voice sample cache hit for {cache_key[:16]}...")
    return payload
|
||||
|
||||
|
||||
def _cache_voice_sample(cache_key: str, data: bytes, max_entries: int = 50) -> None:
    """Store voice sample bytes in the in-memory cache.

    Room is made *before* inserting, so the cache never holds more than
    ``max_entries`` samples (at least one entry is always kept, even for a
    non-positive limit). Entries with the oldest store timestamp are evicted
    first. Overwriting a key that is already cached does not evict anything.

    Args:
        cache_key: Key to store the sample under.
        data: Raw audio bytes of the voice sample.
        max_entries: Upper bound on the number of cached samples.
    """
    if cache_key not in _voice_sample_cache:
        while _voice_sample_cache and len(_voice_sample_cache) >= max_entries:
            # Each cached value is (stored_at, bytes); evict the smallest stored_at.
            oldest_key = min(_voice_sample_cache, key=lambda k: _voice_sample_cache[k][0])
            del _voice_sample_cache[oldest_key]
    _voice_sample_cache[cache_key] = (time.time(), data)
|
||||
|
||||
|
||||
def _get_latest_voice_sample_url(user_id: str, db) -> Optional[str]:
    """Look up the file URL of the user's most recent voice-cloner audio asset.

    Queries ``ContentAsset`` rows belonging to ``user_id`` whose asset type is
    audio and whose source module is the voice cloner, newest ``created_at``
    first, and takes the first row.

    Args:
        user_id: Owner of the assets to search.
        db: Database session exposing ``query`` (presumably a SQLAlchemy
            session - confirm against the caller).

    Returns:
        The ``file_url`` of the newest matching asset, or ``None`` when there
        is no such asset, it has no URL, or the lookup raises.
    """
    try:
        from models.content_asset_models import ContentAsset, AssetType, AssetSource
        from sqlalchemy import desc

        matching_assets = db.query(ContentAsset).filter(
            ContentAsset.user_id == user_id,
            ContentAsset.asset_type == AssetType.AUDIO,
            ContentAsset.source_module == AssetSource.VOICE_CLONER,
        )
        latest = matching_assets.order_by(desc(ContentAsset.created_at)).first()

        if not (latest and latest.file_url):
            logger.warning(f"[Podcast] No voice sample asset found for user {user_id}")
            return None

        logger.info(f"[Podcast] Found voice sample for user {user_id}: {latest.file_url}")
        return latest.file_url
    except Exception as e:
        # Best-effort lookup: any failure is logged and reported as "no sample".
        logger.error(f"[Podcast] Error fetching voice sample URL: {e}")
        return None
|
||||
|
||||
|
||||
def _resolve_file_within(base_dir, filename: str):
    """Return ``base_dir / filename`` only if it is a regular file inside ``base_dir``.

    The candidate is fully resolved before the containment check, so ``..``
    segments, absolute paths and symlinks cannot escape ``base_dir``.

    Args:
        base_dir: Directory the file must live under (assumed to be a
            ``pathlib.Path`` - confirm for ``get_podcast_media_dir``).
        filename: Untrusted relative file name taken from a URL.

    Returns:
        The resolved path, or ``None`` for an empty name, a path that escapes
        ``base_dir``, or anything that is not a regular file.
    """
    if not filename:
        return None
    root = base_dir.resolve()
    candidate = (root / filename).resolve()
    try:
        candidate.relative_to(root)
    except ValueError:
        return None
    return candidate if candidate.is_file() else None


def _fetch_voice_sample(voice_sample_url: str, user_id: str) -> Optional[bytes]:
    """Fetch voice sample audio bytes from a URL, with in-memory caching.

    Lookup order, first hit wins and is cached:
      1. the in-memory cache (key = MD5 of ``user_id:voice_sample_url``);
      2. the user's workspace ``voice_samples`` directory, for ``/api/assets/`` URLs;
      3. ``resolve_media_path`` from ``utils.media_utils``;
      4. the user's podcast audio directory, for ``/api/podcast/audio/`` URLs;
      5. a direct HTTP(S) download.

    File names extracted from the URL are untrusted, so local lookups only
    read regular files that resolve inside the expected directory.

    Args:
        voice_sample_url: URL (or API path) of the voice sample.
        user_id: Owner of the sample; selects the workspace and scopes the cache key.

    Returns:
        The audio bytes, or ``None`` if the sample could not be located or any
        step raised (errors are logged, never propagated).
    """
    # MD5 is used only to derive a compact cache key, not for security.
    cache_key = hashlib.md5(f"{user_id}:{voice_sample_url}".encode()).hexdigest()

    # Check in-memory cache first
    cached = _get_cached_voice_sample(cache_key)
    if cached is not None:
        return cached

    try:
        from utils.media_utils import resolve_media_path

        # Try resolving as a local workspace path first (fastest)
        if "/api/assets/" in voice_sample_url:
            from api.podcast.constants import ROOT_DIR

            # The workspace is chosen from the authenticated user_id, never
            # from the user id embedded in the URL.
            sanitized_uid = "".join(c for c in user_id if c.isalnum() or c in ("-", "_"))
            parts = voice_sample_url.split("/")
            # Expected: /api/assets/{user_id}/voice_samples/{filename}
            try:
                filename = parts[parts.index("voice_samples") + 1].split("?")[0]
            except (ValueError, IndexError):
                filename = ""
            samples_dir = (
                ROOT_DIR / "workspace" / f"workspace_{sanitized_uid}" / "assets" / "voice_samples"
            )
            local_path = _resolve_file_within(samples_dir, filename)
            if local_path is not None:
                data = local_path.read_bytes()
                _cache_voice_sample(cache_key, data)
                logger.info(f"[Podcast] Voice sample loaded from workspace: {local_path}")
                return data

        # Fall back to media utils resolver
        local_path = resolve_media_path(voice_sample_url)
        # is_file() rather than exists(): a directory would make read_bytes()
        # raise and abort the remaining fallbacks.
        if local_path and local_path.is_file():
            data = local_path.read_bytes()
            _cache_voice_sample(cache_key, data)
            return data

        # Try resolving as a podcast audio file
        if "/api/podcast/audio/" in voice_sample_url:
            filename = voice_sample_url.split("/api/podcast/audio/")[-1].split("?")[0]
            try:
                audio_dir = get_podcast_media_dir("audio", user_id)
                # Containment check blocks "../" and absolute-path traversal.
                local_path = _resolve_file_within(audio_dir, filename)
                if local_path is not None:
                    data = local_path.read_bytes()
                    _cache_voice_sample(cache_key, data)
                    return data
            except Exception as lookup_err:
                # Best-effort: keep going to the HTTP fallback, but leave a trace.
                logger.debug(f"[Podcast] Podcast audio lookup for voice sample failed: {lookup_err}")

        # Try direct HTTP fetch as fallback
        if voice_sample_url.startswith(("http://", "https://")):
            # NOTE(review): the URL can come straight from the request body, so
            # this server-side fetch is an SSRF vector - consider allow-listing
            # hosts and capping the response size.
            logger.info(f"[Podcast] Fetching voice sample via HTTP: {voice_sample_url[:80]}...")
            resp = requests.get(voice_sample_url, timeout=30)
            if resp.status_code == 200:
                data = resp.content
                _cache_voice_sample(cache_key, data)
                logger.info(f"[Podcast] Voice sample fetched via HTTP ({len(data)} bytes)")
                return data

        logger.warning(f"[Podcast] Could not fetch voice sample from: {voice_sample_url}")
        return None
    except Exception as e:
        logger.error(f"[Podcast] Error fetching voice sample: {e}")
        return None
|
||||
|
||||
|
||||
@router.post("/audio/upload")
|
||||
async def upload_podcast_audio(
|
||||
@@ -125,35 +251,176 @@ async def generate_podcast_audio(
|
||||
raise HTTPException(status_code=400, detail="Text is required")
|
||||
|
||||
try:
|
||||
audio_service = get_podcast_audio_service(user_id)
|
||||
logger.warning(f"[Podcast] Generating audio with service dir: {audio_service.output_dir}")
|
||||
result: StoryAudioResult = audio_service.generate_ai_audio(
|
||||
scene_number=0,
|
||||
scene_title=request.scene_title,
|
||||
text=request.text.strip(),
|
||||
user_id=user_id,
|
||||
voice_id=request.voice_id or "Wise_Woman",
|
||||
custom_voice_id=request.custom_voice_id,
|
||||
speed=request.speed or 1.0, # Normal speed (was 0.9, but too slow - causing duration issues)
|
||||
volume=request.volume or 1.0,
|
||||
pitch=request.pitch or 0.0, # Normal pitch (0.0 = neutral)
|
||||
emotion=request.emotion or "neutral",
|
||||
english_normalization=request.english_normalization or False,
|
||||
sample_rate=request.sample_rate,
|
||||
bitrate=request.bitrate,
|
||||
channel=request.channel,
|
||||
format=request.format,
|
||||
language_boost=request.language_boost,
|
||||
enable_sync_mode=request.enable_sync_mode,
|
||||
# Determine if we should use voice clone path
|
||||
# Voice clone is used when: explicitly requested, OR when voice_id/custom_voice_id indicates a clone
|
||||
# (cloned voice IDs start with "vc_" or match the placeholder "MY_VOICE_CLONE")
|
||||
_vid = request.voice_id or ""
|
||||
_cvid = request.custom_voice_id or ""
|
||||
is_voice_clone = request.use_voice_clone or (
|
||||
_cvid.startswith("vc_") or _cvid == "MY_VOICE_CLONE"
|
||||
) or (
|
||||
_vid.startswith("vc_") or _vid == "MY_VOICE_CLONE"
|
||||
)
|
||||
|
||||
# Override URL to use podcast endpoint instead of story endpoint
|
||||
if result.get("audio_url") and "/api/story/audio/" in result.get("audio_url", ""):
|
||||
audio_filename = result.get("audio_filename", "")
|
||||
result["audio_url"] = f"/api/podcast/audio/{audio_filename}"
|
||||
|
||||
logger.warning(f"[Podcast] Audio generated - path: {result.get('audio_path')}, url: {result.get('audio_url')}")
|
||||
# If voice_id is a clone ID, normalize it to use Wise_Woman for TTS fallback
|
||||
effective_voice_id = _vid if not (_vid.startswith("vc_") or _vid == "MY_VOICE_CLONE") else "Wise_Woman"
|
||||
|
||||
logger.warning(f"[Podcast] Audio request: use_voice_clone={request.use_voice_clone}, voice_id={request.voice_id}, custom_voice_id={request.custom_voice_id}, is_voice_clone={is_voice_clone}, voice_sample_url={request.voice_sample_url}, voice_clone_engine={request.voice_clone_engine}")
|
||||
|
||||
# Voice clone path: use user's voice sample with scene text as reference
|
||||
if is_voice_clone:
|
||||
# If no voice_sample_url provided, try to fetch it from the user's latest voice clone
|
||||
voice_sample_url = request.voice_sample_url
|
||||
if not voice_sample_url:
|
||||
try:
|
||||
voice_sample_url = _get_latest_voice_sample_url(user_id, db)
|
||||
logger.warning(f"[Podcast] DB fallback voice sample URL for user {user_id}: {voice_sample_url}")
|
||||
except Exception as e:
|
||||
logger.warning(f"[Podcast] Could not fetch voice sample URL: {e}")
|
||||
|
||||
if voice_sample_url:
|
||||
from services.llm_providers.main_audio_generation import qwen3_voice_clone, cosyvoice_voice_clone
|
||||
|
||||
engine = (request.voice_clone_engine or "qwen3").lower()
|
||||
logger.warning(f"[Podcast] 🔊 Voice clone path: engine={engine}, scene='{request.scene_title}', voice_sample_url={voice_sample_url[:80]}...")
|
||||
|
||||
# Download voice sample from URL (with caching)
|
||||
logger.warning(f"[Podcast] Fetching voice sample from: {voice_sample_url}")
|
||||
try:
|
||||
voice_sample_bytes = _fetch_voice_sample(voice_sample_url, user_id)
|
||||
except Exception as fetch_err:
|
||||
logger.error(f"[Podcast] ❌ Failed to fetch voice sample: {fetch_err}", exc_info=True)
|
||||
raise HTTPException(status_code=400, detail=f"Could not fetch voice sample: {str(fetch_err)}")
|
||||
logger.warning(f"[Podcast] Voice sample fetch result: {len(voice_sample_bytes) if voice_sample_bytes else 0} bytes")
|
||||
if not voice_sample_bytes:
|
||||
raise HTTPException(status_code=400, detail=f"Could not fetch voice sample from {voice_sample_url}")
|
||||
|
||||
scene_text = request.text.strip()
|
||||
if len(scene_text) > 4000:
|
||||
scene_text = scene_text[:4000]
|
||||
|
||||
# Run voice clone in thread pool to avoid blocking the event loop
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
try:
|
||||
if engine == "minimax":
|
||||
from services.llm_providers.main_audio_generation import clone_voice
|
||||
import random
|
||||
import string
|
||||
random_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=8))
|
||||
custom_vid = request.custom_voice_id or f"vc_{random_suffix}"
|
||||
|
||||
result_obj = await loop.run_in_executor(
|
||||
_audio_executor,
|
||||
lambda cv=custom_vid: clone_voice(
|
||||
audio_bytes=voice_sample_bytes,
|
||||
custom_voice_id=cv,
|
||||
text=scene_text,
|
||||
user_id=user_id,
|
||||
),
|
||||
)
|
||||
audio_bytes = result_obj.preview_audio_bytes
|
||||
provider = "minimax"
|
||||
model = "minimax/voice-clone"
|
||||
elif engine == "cosyvoice":
|
||||
result_obj = await loop.run_in_executor(
|
||||
_audio_executor,
|
||||
lambda: cosyvoice_voice_clone(
|
||||
audio_bytes=voice_sample_bytes,
|
||||
text=scene_text,
|
||||
user_id=user_id,
|
||||
),
|
||||
)
|
||||
audio_bytes = result_obj.preview_audio_bytes
|
||||
provider = "wavespeed-ai"
|
||||
model = "wavespeed-ai/cosyvoice-tts/voice-clone"
|
||||
else:
|
||||
result_obj = await loop.run_in_executor(
|
||||
_audio_executor,
|
||||
lambda: qwen3_voice_clone(
|
||||
audio_bytes=voice_sample_bytes,
|
||||
text=scene_text,
|
||||
user_id=user_id,
|
||||
),
|
||||
)
|
||||
audio_bytes = result_obj.preview_audio_bytes
|
||||
provider = "wavespeed-ai"
|
||||
model = "wavespeed-ai/qwen3-tts/voice-clone"
|
||||
|
||||
logger.warning(f"[Podcast] 🔊 Voice clone result: {len(audio_bytes) if audio_bytes else 0} bytes, provider={provider}")
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as clone_err:
|
||||
logger.error(f"[Podcast] ❌ Voice clone failed: {clone_err}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=f"Voice clone generation failed: {str(clone_err)}")
|
||||
|
||||
# Save audio bytes to file
|
||||
audio_service = get_podcast_audio_service(user_id)
|
||||
audio_filename = f"scene_{request.scene_id}_{uuid.uuid4().hex[:8]}.mp3"
|
||||
audio_path = audio_service.output_dir / audio_filename
|
||||
|
||||
with open(audio_path, "wb") as f:
|
||||
f.write(audio_bytes)
|
||||
|
||||
file_size = len(audio_bytes)
|
||||
audio_url = f"/api/podcast/audio/{audio_filename}"
|
||||
cost = max(0.005, 0.005 * (len(scene_text) / 100.0))
|
||||
|
||||
result = {
|
||||
"audio_path": str(audio_path),
|
||||
"audio_filename": audio_filename,
|
||||
"audio_url": audio_url,
|
||||
"file_size": file_size,
|
||||
"provider": provider,
|
||||
"model": model,
|
||||
"cost": cost,
|
||||
"scene_number": 0,
|
||||
"scene_title": request.scene_title,
|
||||
}
|
||||
|
||||
else:
|
||||
# Standard TTS path - but NOT if custom_voice_id is a clone ID
|
||||
# Clone IDs (vc_*, MY_VOICE_CLONE) are not valid for minimax TTS
|
||||
if is_voice_clone:
|
||||
logger.warning(f"[Podcast] ⚠️ Voice clone detected but no voice sample available - falling back to standard TTS with voice_id={effective_voice_id}")
|
||||
effective_custom_voice_id = request.custom_voice_id
|
||||
if effective_custom_voice_id and (
|
||||
effective_custom_voice_id.startswith("vc_") or
|
||||
effective_custom_voice_id == "MY_VOICE_CLONE"
|
||||
):
|
||||
logger.warning(f"[Podcast] Ignoring clone ID '{effective_custom_voice_id}' in standard TTS path - no voice sample URL available")
|
||||
effective_custom_voice_id = None
|
||||
|
||||
audio_service = get_podcast_audio_service(user_id)
|
||||
logger.warning(f"[Podcast] Standard TTS path: voice_id={effective_voice_id}, custom_voice_id={effective_custom_voice_id}")
|
||||
result: StoryAudioResult = audio_service.generate_ai_audio(
|
||||
scene_number=0,
|
||||
scene_title=request.scene_title,
|
||||
text=request.text.strip(),
|
||||
user_id=user_id,
|
||||
voice_id=effective_voice_id,
|
||||
custom_voice_id=effective_custom_voice_id,
|
||||
speed=request.speed or 1.0, # Normal speed (was 0.9, but too slow - causing duration issues)
|
||||
volume=request.volume or 1.0,
|
||||
pitch=request.pitch or 0.0, # Normal pitch (0.0 = neutral)
|
||||
emotion=request.emotion or "neutral",
|
||||
english_normalization=request.english_normalization or False,
|
||||
sample_rate=request.sample_rate,
|
||||
bitrate=request.bitrate,
|
||||
channel=request.channel,
|
||||
format=request.format,
|
||||
language_boost=request.language_boost,
|
||||
enable_sync_mode=request.enable_sync_mode,
|
||||
)
|
||||
|
||||
# Override URL to use podcast endpoint instead of story endpoint
|
||||
if result.get("audio_url") and "/api/story/audio/" in result.get("audio_url", ""):
|
||||
audio_filename = result.get("audio_filename", "")
|
||||
result["audio_url"] = f"/api/podcast/audio/{audio_filename}"
|
||||
|
||||
logger.warning(f"[Podcast] Audio generated - path: {result.get('audio_path')}, url: {result.get('audio_url')}")
|
||||
except Exception as exc:
|
||||
logger.error(f"[Podcast] ❌ Audio generation failed: {exc}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail=f"Audio generation failed: {exc}")
|
||||
|
||||
# Save to asset library (podcast module)
|
||||
|
||||
Reference in New Issue
Block a user