feat: voice clone audio generation + podcast workspace architecture

- Voice clone integration: When the user selects voice clone in the Write phase, the backend uses their uploaded voice sample + scene script text to generate audio via the qwen3/minimax/cosyvoice voice clone APIs
- Multi-tenant workspace storage: All podcast assets (audio, video, images, charts) now use workspace-specific directories per user
- Chart preview improvements: Card-based B-Roll charts UI with thumbnails, takeaway text, and action buttons; public endpoint for image serving
- Voice clone caching: In-memory LRU cache for voice samples (avoids re-downloading per scene); frontend caches voice clone metadata
- Thread pool for voice clone: Audio generation uses ThreadPoolExecutor to avoid blocking the FastAPI event loop
- Auto-detect voice clone IDs (vc_*, MY_VOICE_CLONE) to route correctly
- DB fallback for voice sample URL: Fetches from ContentAsset if not passed
- Fixed API URL resolution for chart previews
- Fixed GlassyCard DOM warnings for motion props
- Fixed ScriptGenerationProgressView syntax error
- Fixed usePodcastWorkflow scriptData reference
2026-04-21 19:38:50 +05:30
parent 7637babd7d
commit 91b2f996fd
33 changed files with 1642 additions and 457 deletions
--- a/backend/api/podcast/handlers/audio.py
+++ b/backend/api/podcast/handlers/audio.py
@@ -12,7 +12,15 @@ from pathlib import Path
 from urllib.parse import urlparse
 import tempfile
 import uuid
+import hashlib
+import time
 import shutil
+import requests
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+
+import asyncio
+from concurrent.futures import ThreadPoolExecutor

 from services.database import get_db
 from middleware.auth_middleware import get_current_user, get_current_user_with_query_token
@@ -31,6 +39,124 @@ from ..models import (

 router = APIRouter()

+# Thread pool used to run the blocking voice clone calls off the event loop (at most 2 workers)
+_audio_executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="podcast_audio")
+
+# In-memory TTL cache for voice samples: cache_key -> (stored-at timestamp, audio bytes); not a true LRU
+_voice_sample_cache: dict[str, tuple[float, bytes]] = {}
+_VOICE_SAMPLE_CACHE_TTL = 1800  # seconds (30 minutes)
+
+
+def _get_cached_voice_sample(cache_key: str) -> Optional[bytes]:
+    """Return the bytes cached under cache_key if stored less than _VOICE_SAMPLE_CACHE_TTL seconds ago, else None."""
+    if cache_key in _voice_sample_cache:
+        ts, data = _voice_sample_cache[cache_key]
+        if time.time() - ts < _VOICE_SAMPLE_CACHE_TTL:
+            logger.debug(f"[Podcast] Voice sample cache hit for {cache_key[:16]}...")
+            return data  # a hit does not refresh ts, so expiry is measured from when the entry was stored
+        del _voice_sample_cache[cache_key]  # expired: drop the stale entry
+    return None
+
+
+def _cache_voice_sample(cache_key: str, data: bytes) -> None:
+    """Store data under cache_key stamped with the current time, evicting one oldest entry first if the cache is large."""
+    # Evict the entry with the oldest stored-at timestamp once the cache holds more than 50 items
+    if len(_voice_sample_cache) > 50:  # checked before the insert, so the cache can reach 51 entries
+        oldest_key = min(_voice_sample_cache, key=lambda k: _voice_sample_cache[k][0])
+        del _voice_sample_cache[oldest_key]
+    _voice_sample_cache[cache_key] = (time.time(), data)
+
+
+def _get_latest_voice_sample_url(user_id: str, db) -> Optional[str]:
+    """Return file_url of the user's newest AUDIO ContentAsset from the voice cloner, or None if absent or on error."""
+    try:
+        from models.content_asset_models import ContentAsset, AssetType, AssetSource
+        from sqlalchemy import desc
+        
+        asset = db.query(ContentAsset).filter(  # newest matching asset first (ordered by created_at descending)
+            ContentAsset.user_id == user_id,
+            ContentAsset.asset_type == AssetType.AUDIO,
+            ContentAsset.source_module == AssetSource.VOICE_CLONER,
+        ).order_by(desc(ContentAsset.created_at)).first()
+        
+        if asset and asset.file_url:
+            logger.info(f"[Podcast] Found voice sample for user {user_id}: {asset.file_url}")
+            return asset.file_url
+        
+        logger.warning(f"[Podcast] No voice sample asset found for user {user_id}")
+        return None
+    except Exception as e:  # best-effort lookup: any failure is logged and reported as None
+        logger.error(f"[Podcast] Error fetching voice sample URL: {e}")
+        return None
+
+
+def _fetch_voice_sample(voice_sample_url: str, user_id: str) -> Optional[bytes]:
+    """Return the voice sample bytes for voice_sample_url (cache, then local files, then HTTP), or None on failure."""
+    cache_key = hashlib.md5(f"{user_id}:{voice_sample_url}".encode()).hexdigest()  # key combines user and URL
+    
+    # Check in-memory cache first
+    cached = _get_cached_voice_sample(cache_key)
+    if cached is not None:
+        return cached
+    
+    try:
+        from utils.media_utils import resolve_media_path
+
+        # Try resolving as a local workspace path first (fastest)
+        if "/api/assets/" in voice_sample_url:
+            # Build the workspace directory name from user_id reduced to alphanumerics, "-" and "_"
+            sanitized_uid = "".join(c for c in user_id if c.isalnum() or c in ("-", "_"))
+            from api.podcast.constants import ROOT_DIR
+            parts = voice_sample_url.split("/")
+            # Expected: /api/assets/{user_id}/voice_samples/{filename}
+            try:
+                idx = parts.index("voice_samples")
+                filename = parts[idx + 1].split("?")[0]  # drop any query string
+                local_path = ROOT_DIR / "workspace" / f"workspace_{sanitized_uid}" / "assets" / "voice_samples" / filename
+                if local_path.exists():
+                    data = local_path.read_bytes()
+                    _cache_voice_sample(cache_key, data)
+                    logger.info(f"[Podcast] Voice sample loaded from workspace: {local_path}")
+                    return data
+            except (ValueError, IndexError):  # no "voice_samples" segment, or nothing after it
+                pass
+
+            # Fall back to media utils resolver
+            local_path = resolve_media_path(voice_sample_url)
+            if local_path and local_path.exists():
+                data = local_path.read_bytes()
+                _cache_voice_sample(cache_key, data)
+                return data
+
+        # Try resolving as a podcast audio file
+        if "/api/podcast/audio/" in voice_sample_url:
+            filename = voice_sample_url.split("/api/podcast/audio/")[-1].split("?")[0]  # NOTE(review): not sanitized; "../" segments could escape audio_dir — confirm/validate
+            try:
+                audio_dir = get_podcast_media_dir("audio", user_id)
+                local_path = audio_dir / filename
+                if local_path.exists():
+                    data = local_path.read_bytes()
+                    _cache_voice_sample(cache_key, data)
+                    return data
+            except Exception:  # NOTE(review): silently swallows any error from this lookup and falls through
+                pass
+
+        # Try direct HTTP fetch as fallback
+        if voice_sample_url.startswith("http"):
+            logger.info(f"[Podcast] Fetching voice sample via HTTP: {voice_sample_url[:80]}...")
+            resp = requests.get(voice_sample_url, timeout=30)  # NOTE(review): blocking fetch of a caller-supplied URL — confirm it is trusted and not run on the event loop
+            if resp.status_code == 200:
+                data = resp.content
+                _cache_voice_sample(cache_key, data)
+                logger.info(f"[Podcast] Voice sample fetched via HTTP ({len(data)} bytes)")
+                return data
+
+        logger.warning(f"[Podcast] Could not fetch voice sample from: {voice_sample_url}")
+        return None
+    except Exception as e:  # any unexpected failure is logged and reported to the caller as None
+        logger.error(f"[Podcast] Error fetching voice sample: {e}")
+        return None
+

@router.post("/audio/upload")
 async def upload_podcast_audio(
@@ -125,35 +251,176 @@ async def generate_podcast_audio(
        raise HTTPException(status_code=400, detail="Text is required")

    try:
-        audio_service = get_podcast_audio_service(user_id)
-        logger.warning(f"[Podcast] Generating audio with service dir: {audio_service.output_dir}")
-        result: StoryAudioResult = audio_service.generate_ai_audio(
-            scene_number=0,
-            scene_title=request.scene_title,
-            text=request.text.strip(),
-            user_id=user_id,
-            voice_id=request.voice_id or "Wise_Woman",
-            custom_voice_id=request.custom_voice_id,
-            speed=request.speed or 1.0,  # Normal speed (was 0.9, but too slow - causing duration issues)
-            volume=request.volume or 1.0,
-            pitch=request.pitch or 0.0,  # Normal pitch (0.0 = neutral)
-            emotion=request.emotion or "neutral",
-            english_normalization=request.english_normalization or False,
-            sample_rate=request.sample_rate,
-            bitrate=request.bitrate,
-            channel=request.channel,
-            format=request.format,
-            language_boost=request.language_boost,
-            enable_sync_mode=request.enable_sync_mode,
+        # Determine if we should use voice clone path
+        # Voice clone is used when: explicitly requested, OR when voice_id/custom_voice_id indicates a clone
+        # (cloned voice IDs start with "vc_" or match the placeholder "MY_VOICE_CLONE")
+        _vid = request.voice_id or ""
+        _cvid = request.custom_voice_id or ""
+        is_voice_clone = request.use_voice_clone or (
+            _cvid.startswith("vc_") or _cvid == "MY_VOICE_CLONE"
+        ) or (
+            _vid.startswith("vc_") or _vid == "MY_VOICE_CLONE"
        )
        
-        # Override URL to use podcast endpoint instead of story endpoint
-        if result.get("audio_url") and "/api/story/audio/" in result.get("audio_url", ""):
-            audio_filename = result.get("audio_filename", "")
-            result["audio_url"] = f"/api/podcast/audio/{audio_filename}"
-        
-        logger.warning(f"[Podcast] Audio generated - path: {result.get('audio_path')}, url: {result.get('audio_url')}")
+        # If voice_id is a clone ID, normalize it to use Wise_Woman for TTS fallback
+        effective_voice_id = _vid if not (_vid.startswith("vc_") or _vid == "MY_VOICE_CLONE") else "Wise_Woman"
+
+        logger.warning(f"[Podcast] Audio request: use_voice_clone={request.use_voice_clone}, voice_id={request.voice_id}, custom_voice_id={request.custom_voice_id}, is_voice_clone={is_voice_clone}, voice_sample_url={request.voice_sample_url}, voice_clone_engine={request.voice_clone_engine}")
+
+        # Voice clone path: use user's voice sample with scene text as reference
+        if is_voice_clone:
+            # If no voice_sample_url provided, try to fetch it from the user's latest voice clone
+            voice_sample_url = request.voice_sample_url
+            if not voice_sample_url:
+                try:
+                    voice_sample_url = _get_latest_voice_sample_url(user_id, db)
+                    logger.warning(f"[Podcast] DB fallback voice sample URL for user {user_id}: {voice_sample_url}")
+                except Exception as e:
+                    logger.warning(f"[Podcast] Could not fetch voice sample URL: {e}")
+
+            if voice_sample_url:
+                from services.llm_providers.main_audio_generation import qwen3_voice_clone, cosyvoice_voice_clone
+                
+                engine = (request.voice_clone_engine or "qwen3").lower()
+                logger.warning(f"[Podcast] 🔊 Voice clone path: engine={engine}, scene='{request.scene_title}', voice_sample_url={voice_sample_url[:80]}...")
+
+                # Download voice sample from URL (with caching)
+                logger.warning(f"[Podcast] Fetching voice sample from: {voice_sample_url}")
+                try:
+                    voice_sample_bytes = _fetch_voice_sample(voice_sample_url, user_id)
+                except Exception as fetch_err:
+                    logger.error(f"[Podcast] ❌ Failed to fetch voice sample: {fetch_err}", exc_info=True)
+                    raise HTTPException(status_code=400, detail=f"Could not fetch voice sample: {str(fetch_err)}")
+                logger.warning(f"[Podcast] Voice sample fetch result: {len(voice_sample_bytes) if voice_sample_bytes else 0} bytes")
+                if not voice_sample_bytes:
+                    raise HTTPException(status_code=400, detail=f"Could not fetch voice sample from {voice_sample_url}")
+
+                scene_text = request.text.strip()
+                if len(scene_text) > 4000:
+                    scene_text = scene_text[:4000]
+
+                # Run voice clone in thread pool to avoid blocking the event loop
+                loop = asyncio.get_event_loop()
+                
+                try:
+                    if engine == "minimax":
+                        from services.llm_providers.main_audio_generation import clone_voice
+                        import random
+                        import string
+                        random_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=8))
+                        custom_vid = request.custom_voice_id or f"vc_{random_suffix}"
+                        
+                        result_obj = await loop.run_in_executor(
+                            _audio_executor,
+                            lambda cv=custom_vid: clone_voice(
+                                audio_bytes=voice_sample_bytes,
+                                custom_voice_id=cv,
+                                text=scene_text,
+                                user_id=user_id,
+                            ),
+                        )
+                        audio_bytes = result_obj.preview_audio_bytes
+                        provider = "minimax"
+                        model = "minimax/voice-clone"
+                    elif engine == "cosyvoice":
+                        result_obj = await loop.run_in_executor(
+                            _audio_executor,
+                            lambda: cosyvoice_voice_clone(
+                                audio_bytes=voice_sample_bytes,
+                                text=scene_text,
+                                user_id=user_id,
+                            ),
+                        )
+                        audio_bytes = result_obj.preview_audio_bytes
+                        provider = "wavespeed-ai"
+                        model = "wavespeed-ai/cosyvoice-tts/voice-clone"
+                    else:
+                        result_obj = await loop.run_in_executor(
+                            _audio_executor,
+                            lambda: qwen3_voice_clone(
+                                audio_bytes=voice_sample_bytes,
+                                text=scene_text,
+                                user_id=user_id,
+                            ),
+                        )
+                        audio_bytes = result_obj.preview_audio_bytes
+                        provider = "wavespeed-ai"
+                        model = "wavespeed-ai/qwen3-tts/voice-clone"
+                    
+                    logger.warning(f"[Podcast] 🔊 Voice clone result: {len(audio_bytes) if audio_bytes else 0} bytes, provider={provider}")
+                except HTTPException:
+                    raise
+                except Exception as clone_err:
+                    logger.error(f"[Podcast] ❌ Voice clone failed: {clone_err}", exc_info=True)
+                    raise HTTPException(status_code=500, detail=f"Voice clone generation failed: {str(clone_err)}")
+
+            # Save audio bytes to file
+            audio_service = get_podcast_audio_service(user_id)
+            audio_filename = f"scene_{request.scene_id}_{uuid.uuid4().hex[:8]}.mp3"
+            audio_path = audio_service.output_dir / audio_filename
+            
+            with open(audio_path, "wb") as f:
+                f.write(audio_bytes)
+            
+            file_size = len(audio_bytes)
+            audio_url = f"/api/podcast/audio/{audio_filename}"
+            cost = max(0.005, 0.005 * (len(scene_text) / 100.0))
+
+            result = {
+                "audio_path": str(audio_path),
+                "audio_filename": audio_filename,
+                "audio_url": audio_url,
+                "file_size": file_size,
+                "provider": provider,
+                "model": model,
+                "cost": cost,
+                "scene_number": 0,
+                "scene_title": request.scene_title,
+            }
+
+        else:
+            # Standard TTS path - but NOT if custom_voice_id is a clone ID
+            # Clone IDs (vc_*, MY_VOICE_CLONE) are not valid for minimax TTS
+            if is_voice_clone:
+                logger.warning(f"[Podcast] ⚠️ Voice clone detected but no voice sample available - falling back to standard TTS with voice_id={effective_voice_id}")
+            effective_custom_voice_id = request.custom_voice_id
+            if effective_custom_voice_id and (
+                effective_custom_voice_id.startswith("vc_") or
+                effective_custom_voice_id == "MY_VOICE_CLONE"
+            ):
+                logger.warning(f"[Podcast] Ignoring clone ID '{effective_custom_voice_id}' in standard TTS path - no voice sample URL available")
+                effective_custom_voice_id = None
+            
+            audio_service = get_podcast_audio_service(user_id)
+            logger.warning(f"[Podcast] Standard TTS path: voice_id={effective_voice_id}, custom_voice_id={effective_custom_voice_id}")
+            result: StoryAudioResult = audio_service.generate_ai_audio(
+                scene_number=0,
+                scene_title=request.scene_title,
+                text=request.text.strip(),
+                user_id=user_id,
+                voice_id=effective_voice_id,
+                custom_voice_id=effective_custom_voice_id,
+                speed=request.speed or 1.0,  # Normal speed (was 0.9, but too slow - causing duration issues)
+                volume=request.volume or 1.0,
+                pitch=request.pitch or 0.0,  # Normal pitch (0.0 = neutral)
+                emotion=request.emotion or "neutral",
+                english_normalization=request.english_normalization or False,
+                sample_rate=request.sample_rate,
+                bitrate=request.bitrate,
+                channel=request.channel,
+                format=request.format,
+                language_boost=request.language_boost,
+                enable_sync_mode=request.enable_sync_mode,
+            )
+            
+            # Override URL to use podcast endpoint instead of story endpoint
+            if result.get("audio_url") and "/api/story/audio/" in result.get("audio_url", ""):
+                audio_filename = result.get("audio_filename", "")
+                result["audio_url"] = f"/api/podcast/audio/{audio_filename}"
+            
+            logger.warning(f"[Podcast] Audio generated - path: {result.get('audio_path')}, url: {result.get('audio_url')}")
    except Exception as exc:
+        logger.error(f"[Podcast] ❌ Audio generation failed: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Audio generation failed: {exc}")

    # Save to asset library (podcast module)