feat: voice clone audio generation + podcast workspace architecture

- Voice clone integration: When the user selects a voice clone in the Write phase, the backend uses their uploaded voice sample plus the scene script text to generate audio via the qwen3/minimax/cosyvoice voice clone APIs
- Multi-tenant workspace storage: All podcast assets (audio, video, images, charts) now use workspace-specific directories per user
- Chart preview improvements: Card-based B-Roll charts UI with thumbnails, takeaway text, and action buttons; public endpoint for image serving
- Voice clone caching: In-memory LRU cache for voice samples (avoids re-downloading per scene); the frontend caches voice clone metadata
- Thread pool for voice clone: Audio generation uses ThreadPoolExecutor to avoid blocking the FastAPI event loop
- Auto-detect voice clone IDs (vc_*, MY_VOICE_CLONE) to route correctly
- DB fallback for voice sample URL: Fetches it from ContentAsset if it is not passed in
- Fixed API URL resolution for chart previews
- Fixed GlassyCard DOM warnings for motion props
- Fixed ScriptGenerationProgressView syntax error
- Fixed usePodcastWorkflow scriptData reference
2026-04-21 19:38:50 +05:30
parent 7637babd7d
commit 91b2f996fd
33 changed files with 1642 additions and 457 deletions
--- a/frontend/src/components/PodcastMaker/ScriptEditor/SceneEditor.tsx
+++ b/frontend/src/components/PodcastMaker/ScriptEditor/SceneEditor.tsx
@@ -16,7 +16,7 @@ import { GlassyCard, glassyCardSx, PrimaryButton } from "../ui";
 import { LineEditor } from "./LineEditor";
 import { ImageRegenerateModal, ImageGenerationSettings } from "./ImageRegenerateModal";
 import { AudioRegenerateModal, AudioGenerationSettings } from "./AudioRegenerateModal";
-import { podcastApi } from "../../../services/podcastApi";
+import { podcastApi, getCachedVoiceCloneInfo } from "../../../services/podcastApi";
 import { aiApiClient } from "../../../api/client";
 import { getCachedMedia, setCachedMedia } from "../../../utils/mediaCache";

@@ -68,6 +68,9 @@ export const SceneEditor: React.FC<SceneEditorProps> = ({
  const [audioSettings, setAudioSettings] = useState<AudioGenerationSettings>({
    voiceId: knobs.voice_id || "Wise_Woman",
    customVoiceId: knobs.custom_voice_id || undefined,
+    useVoiceClone: knobs.is_voice_clone || false,
+    voiceSampleUrl: knobs.voice_sample_url || undefined,
+    voiceCloneEngine: knobs.voice_clone_engine || undefined,
    speed: knobs.voice_speed ?? 1.0,
    volume: 1.0,
    pitch: 0.0,
@@ -308,10 +311,14 @@ export const SceneEditor: React.FC<SceneEditorProps> = ({
      
      // Generate audio
      const effectiveSettings = settings || audioSettings;
+      const cachedClone = getCachedVoiceCloneInfo();
      const result = await podcastApi.renderSceneAudio({
        scene: currentScene,
        voiceId: effectiveSettings.voiceId || knobs.voice_id || "Wise_Woman",
-        customVoiceId: effectiveSettings.customVoiceId || knobs.custom_voice_id,
+        customVoiceId: effectiveSettings.customVoiceId || knobs.custom_voice_id || cachedClone?.customVoiceId,
+        useVoiceClone: effectiveSettings.useVoiceClone || knobs.is_voice_clone || cachedClone?.isVoiceClone || false,
+        voiceSampleUrl: effectiveSettings.voiceSampleUrl || knobs.voice_sample_url || cachedClone?.voiceSampleUrl || undefined,
+        voiceCloneEngine: effectiveSettings.voiceCloneEngine || knobs.voice_clone_engine || cachedClone?.engine || undefined,
        emotion: effectiveSettings.emotion || scene.emotion || knobs.voice_emotion || "neutral",
        speed: effectiveSettings.speed ?? knobs.voice_speed ?? 1.0,
        volume: effectiveSettings.volume ?? 1.0,