feat: podcast demo mode with ALWRITY_ENABLED_FEATURES support

- Add ALWRITY_ENABLED_FEATURES env var for feature gating
- Podcast-only mode: skip LLM bootstrap, scheduler, persona services
- Enhance video generation prompt with scene context, analysis, narration
- Add voice cloning support via custom_voice_id in WaveSpeed
- Add text-to-speech for research results (browser speechSynthesis)
- Fix render queue to sync images from script phase
- Add WaveSpeed LLM pricing (gpt-oss-120b)
- Fix podcast bible generation error handling
- Refactor RouterManager for feature-based router loading
2026-04-03 06:59:59 +05:30
parent c52b1eabc9
commit 63bb937796
58 changed files with 3568 additions and 1597 deletions
--- a/backend/services/wavespeed/client.py
+++ b/backend/services/wavespeed/client.py
@@ -241,6 +241,7 @@ class WaveSpeedClient:
        self,
        text: str,
        voice_id: str,
+        custom_voice_id: Optional[str] = None,
        speed: float = 1.0,
        volume: float = 1.0,
        pitch: float = 0.0,
@@ -255,6 +256,7 @@ class WaveSpeedClient:
        Args:
            text: Text to convert to speech (max 10000 characters)
            voice_id: Voice ID (e.g., "Wise_Woman", "Friendly_Person", etc.)
+            custom_voice_id: Custom voice clone ID for using cloned voice
            speed: Speech speed (0.5-2.0, default: 1.0)
            volume: Speech volume (0.1-10.0, default: 1.0)
            pitch: Speech pitch (-12 to 12, default: 0.0)
@@ -269,6 +271,7 @@ class WaveSpeedClient:
        return self.speech.generate_speech(
            text=text,
            voice_id=voice_id,
+            custom_voice_id=custom_voice_id,
            speed=speed,
            volume=volume,
            pitch=pitch,
--- a/backend/services/wavespeed/generators/speech.py
+++ b/backend/services/wavespeed/generators/speech.py
@@ -40,6 +40,7 @@ class SpeechGenerator:
        self,
        text: str,
        voice_id: str,
+        custom_voice_id: Optional[str] = None,
        speed: float = 1.0,
        volume: float = 1.0,
        pitch: float = 0.0,
@@ -54,6 +55,7 @@ class SpeechGenerator:
        Args:
            text: Text to convert to speech (max 10000 characters)
            voice_id: Voice ID (e.g., "Wise_Woman", "Friendly_Person", etc.)
+            custom_voice_id: Custom voice clone ID for using cloned voice
            speed: Speech speed (0.5-2.0, default: 1.0)
            volume: Speech volume (0.1-10.0, default: 1.0)
            pitch: Speech pitch (-12 to 12, default: 0.0)
@@ -77,6 +79,11 @@ class SpeechGenerator:
        if not sanitized_voice_id:
            raise ValueError("Voice ID cannot be empty after sanitization")
        
+        # Sanitize custom_voice_id if provided
+        sanitized_custom_voice_id = None
+        if custom_voice_id:
+            sanitized_custom_voice_id = str(custom_voice_id).strip() or None
+        
        # Ensure numeric parameters are proper floats and within valid ranges
        sanitized_speed = max(0.5, min(2.0, float(speed))) if speed is not None else 1.0
        sanitized_volume = max(0.1, min(10.0, float(volume))) if volume is not None else 1.0
@@ -112,6 +119,10 @@ class SpeechGenerator:
            "enable_sync_mode": bool(enable_sync_mode),
        }
        
+        # Add custom voice clone ID if provided
+        if sanitized_custom_voice_id:
+            payload["custom_voice_id"] = sanitized_custom_voice_id
+        
        # Add optional parameters with proper type validation
        optional_params = [
            "english_normalization",
@@ -179,6 +190,20 @@ class SpeechGenerator:
        
        if response.status_code != 200:
            logger.error(f"[WaveSpeed] Speech generation failed: {response.status_code} {response.text}")
+            
+            # Check for custom voice ID specific errors
+            response_text = response.text.lower()
+            if "custom_voice" in response_text or "voice_id" in response_text:
+                raise HTTPException(
+                    status_code=400,
+                    detail={
+                        "error": "Invalid voice clone ID",
+                        "message": "The custom voice ID is invalid or expired. Please create a new voice clone or use a predefined voice.",
+                        "status_code": response.status_code,
+                        "response": response.text,
+                    },
+                )
+            
            raise HTTPException(
                status_code=502,
                detail={
--- a/backend/services/wavespeed/infinitetalk.py
+++ b/backend/services/wavespeed/infinitetalk.py
@@ -26,20 +26,24 @@ def _generate_simple_infinitetalk_prompt(
    story_context: Dict[str, Any],
 ) -> Optional[str]:
    """
-    Generate a balanced, concise prompt for InfiniteTalk.
-    InfiniteTalk is audio-driven, so the prompt should describe the scene and suggest
-    subtle motion, but avoid overly elaborate cinematic descriptions.
+    Generate an enhanced prompt for InfiniteTalk video generation.
+    Includes scene content, analysis, bible context, and visual elements.
    
    Returns None if no meaningful prompt can be generated.
    """
    title = (scene_data.get("title") or "").strip()
    description = (scene_data.get("description") or "").strip()
    image_prompt = (scene_data.get("image_prompt") or "").strip()
+    lines = scene_data.get("lines", [])
+    narration = ""
+    if lines:
+        # Combine first few lines for context
+        narration = " ".join([str(l.get("text", "")) for l in lines[:3]])[:150]
    
-    # Build a balanced prompt: scene description + simple motion hint
+    # Build enhanced prompt with multiple context sources
    parts = []
    
-    # Add scene context
+    # Add main scene title
    if title and len(title) > 5 and title.lower() not in ("scene", "podcast", "episode"):
        parts.append(title)
    
@@ -48,60 +52,70 @@ def _generate_simple_infinitetalk_prompt(
    if analysis:
        content_type = analysis.get("content_type")
        if content_type:
-             parts.append(f"Style: {content_type}")
+            parts.append(f"Content type: {content_type}")
        
-        # Audience helps define the formality/vibe
+        # Add key takeaways if available
+        key_takeaways = analysis.get("keyTakeaways", [])
+        if key_takeaways and isinstance(key_takeaways, list) and len(key_takeaways) > 0:
+            takeaway = str(key_takeaways[0])[:80]
+            if takeaway:
+                parts.append(f"Key insight: {takeaway}")
+        
+        # Audience
        audience = analysis.get("audience")
        if audience:
-             # Just use first few words of audience to keep it short
-             short_audience = " ".join(audience.split()[:3])
-             parts.append(f"For: {short_audience}")
-
-    # Add bible context if available
+            short_audience = " ".join(audience.split()[:3])
+            parts.append(f"Target audience: {short_audience}")
+        
+        # Guest info
+        guest_name = analysis.get("guestName")
+        guest_expertise = analysis.get("guestExpertise")
+        if guest_name:
+            parts.append(f"Guest: {guest_name}")
+        if guest_expertise:
+            parts.append(f"Expertise: {guest_expertise}")
+    
+    # Add bible context
    bible = story_context.get("bible", {})
    if bible:
        host_persona = bible.get("host_persona")
        tone = bible.get("tone")
+        visual_style = bible.get("visual_style")
+        background = bible.get("background")
+        
        if host_persona:
-            parts.append(f"Host: {host_persona}")
+            parts.append(f"Host persona: {host_persona}")
        if tone:
            parts.append(f"Tone: {tone}")
-
-    elif description:
-        # Take first sentence or first 60 chars
-        desc_part = description.split('.')[0][:60].strip()
-        if desc_part:
-            parts.append(desc_part)
-    elif image_prompt:
-        # Take first sentence or first 60 chars
-        img_part = image_prompt.split('.')[0][:60].strip()
+        if visual_style:
+            parts.append(f"Visual style: {visual_style}")
+        if background:
+            parts.append(f"Background: {background}")
+    
+    # Add original image prompt as fallback context
+    if image_prompt and len(parts) < 3:
+        img_part = image_prompt.split('.')[0][:100].strip()
        if img_part:
-            parts.append(img_part)
+            parts.append(f"Visual context: {img_part}")
+    
+    # Add narration snippet if available
+    if narration and len(parts) < 4:
+        parts.append(f"Discussing: {narration}")
    
    if not parts:
        return None
    
-    # Add a simple, subtle motion suggestion (not elaborate camera movements)
-    # Keep it natural and audio-driven
-    motion_hints = [
-        "with subtle movement",
-        "with gentle motion",
-        "with natural animation",
-    ]
+    # Build prompt with visual quality keywords
+    quality_keywords = "Cinematic lighting, high detail, 4k quality, smooth motion"
    
-    # Combine scene description with subtle motion hint
-    if len(parts[0]) < 80:
-        # Room for a motion hint
-        prompt = f"{parts[0]}, {motion_hints[0]}"
-    else:
-        # Just use the description if it's already long enough
-        prompt = parts[0]
+    # Combine parts into final prompt
+    prompt = f"{'. '.join(parts)}. {quality_keywords}. With subtle natural movement."
    
-    # Keep it concise - max 120 characters (allows for scene + motion hint)
-    prompt = prompt[:120].strip()
+    # Allow more room for detailed prompts - max 350 characters
+    prompt = prompt[:350].strip()
    
-    # Clean up trailing commas or incomplete sentences
-    if prompt.endswith(','):
+    # Clean up trailing punctuation
+    if prompt.endswith(',') or prompt.endswith('.'):
        prompt = prompt[:-1].strip()
    
    return prompt if len(prompt) >= 15 else None