feat: podcast demo mode with ALWRITY_ENABLED_FEATURES support

- Add ALWRITY_ENABLED_FEATURES env var for feature gating
- Podcast-only mode: skip LLM bootstrap, scheduler, persona services
- Enhance video generation prompt with scene context, analysis, narration
- Add voice cloning support via custom_voice_id in WaveSpeed
- Add text-to-speech for research results (browser speechSynthesis)
- Fix render queue to sync images from script phase
- Add WaveSpeed LLM pricing (gpt-oss-120b)
- Fix podcast bible generation error handling
- Refactor RouterManager for feature-based router loading
2026-04-03 06:59:59 +05:30
parent c52b1eabc9
commit 63bb937796
58 changed files with 3568 additions and 1597 deletions
--- a/backend/services/llm_providers/main_audio_generation.py
+++ b/backend/services/llm_providers/main_audio_generation.py
@@ -62,6 +62,7 @@ class VoiceCloneResult:
 def generate_audio(
    text: str,
    voice_id: str = "Wise_Woman",
+    custom_voice_id: Optional[str] = None,
    speed: float = 1.0,
    volume: float = 1.0,
    pitch: float = 0.0,
@@ -173,6 +174,7 @@ def generate_audio(
            audio_bytes = client.generate_speech(
                text=text,
                voice_id=voice_id,
+                custom_voice_id=custom_voice_id,
                speed=speed,
                volume=volume,
                pitch=pitch,
--- a/backend/services/llm_providers/main_text_generation.py
+++ b/backend/services/llm_providers/main_text_generation.py
@@ -67,7 +67,7 @@ def llm_text_gen(
        resolved_flow_type = flow_type or ("sif_agent" if preferred_hf_models else "premium_tool")
        flow_tag = f"flow_type={resolved_flow_type}"
        
-        logger.info(f"[llm_text_gen][{flow_tag}] Starting text generation")
+        logger.warning(f"[llm_text_gen][{flow_tag}] Starting text generation")
        logger.debug(f"[llm_text_gen] Prompt length: {len(prompt)} characters")
        
        # Set default values for LLM parameters
@@ -94,7 +94,7 @@ def llm_text_gen(
            primary_provider = provider_list[0]
            if primary_provider in ['wavespeed', 'wave']:
                gpt_provider = "wavespeed"
-                model = os.getenv('WAVESPEED_TEXT_MODEL', 'openai/gpt-oss-120b:cerebras')
+                model = os.getenv('WAVESPEED_TEXT_MODEL', 'openai/gpt-oss-120b')
            elif primary_provider in ['gemini', 'google']:
                gpt_provider = "google"
                model = "gemini-2.0-flash-001"
@@ -111,7 +111,7 @@ def llm_text_gen(
        elif preferred_provider:
            if preferred_provider in ['wavespeed', 'wave']:
                gpt_provider = "wavespeed"
-                model = os.getenv('WAVESPEED_TEXT_MODEL', 'openai/gpt-oss-120b:cerebras')
+                model = os.getenv('WAVESPEED_TEXT_MODEL', 'openai/gpt-oss-120b')
            elif preferred_provider in ['openai', 'gpt']:
                gpt_provider = "openai"
                model = os.getenv('OPENAI_MODEL', 'gpt-4o-mini')
@@ -166,7 +166,7 @@ def llm_text_gen(
        if api_key_manager.get_api_key("wavespeed"):
            available_providers.append("wavespeed")
        
-        logger.info(
+        logger.warning(
            f"[llm_text_gen][{flow_tag}] Provider preflight: env_provider='{env_provider or 'auto'}', "
            f"provider_list={provider_list}, strict_provider_mode={strict_provider_mode}, "
            f"available_providers={available_providers}, preferred_provider={preferred_provider or 'none'}, "
@@ -278,7 +278,12 @@ def llm_text_gen(
                    UsageSummary.billing_period == current_period
                ).first()
                
-                # No separate log here - we'll create unified log after API call and usage tracking
+                # Log subscription details before making the API call
+                if usage:
+                    total_llm_calls = (usage.gemini_calls or 0) + (usage.openai_calls or 0) + (usage.anthropic_calls or 0) + (usage.mistral_calls or 0) + (usage.wavespeed_calls or 0)
+                    logger.info(f"[llm_text_gen] Subscription check passed for user {user_id}: provider={actual_provider_name or gpt_provider}, tokens_requested={estimated_total_tokens}, current_usage=${usage.total_cost or 0:.4f}, calls_used={total_llm_calls}")
+                else:
+                    logger.info(f"[llm_text_gen] Subscription check passed for user {user_id}: provider={actual_provider_name or gpt_provider}, tokens_requested={estimated_total_tokens}, new_user_no_usage_record")
                
            finally:
                db.close()
@@ -363,7 +368,7 @@ def llm_text_gen(
                from services.llm_providers.wavespeed_provider import wavespeed_text_response
                response_text = wavespeed_text_response(
                    prompt=prompt,
-                    model=model or "openai/gpt-oss-120b:cerebras",
+                    model=model or "openai/gpt-oss-120b",
                    temperature=temperature,
                    max_tokens=max_tokens,
                    top_p=top_p,
--- a/backend/services/podcast_bible_service.py
+++ b/backend/services/podcast_bible_service.py
@@ -15,14 +15,31 @@ class PodcastBibleService:
    """Service for generating and managing the Podcast Bible."""

    def __init__(self):
-        self.personalization_service = PersonalizationService()
+        try:
+            from services.product_marketing.personalization_service import PersonalizationService
+            self.personalization_service = PersonalizationService()
+        except Exception as e:
+            logger.warning(f"Failed to initialize PersonalizationService: {e}")
+            self.personalization_service = None

    def generate_bible(self, user_id: str, project_id: str) -> PodcastBible:
        """Generate a Podcast Bible from onboarding data."""
        logger.info(f"Generating Podcast Bible for user {user_id}")
        
        try:
-            preferences = self.personalization_service.get_user_preferences(user_id) or {}
+            if not self.personalization_service:
+                logger.warning("PersonalizationService not available, using default bible")
+                return self._get_default_bible(project_id)
+            
+            try:
+                preferences = self.personalization_service.get_user_preferences(user_id)
+            except Exception as pref_err:
+                logger.warning(f"Failed to get user preferences: {pref_err}, using defaults")
+                return self._get_default_bible(project_id)
+            
+            if not preferences:
+                logger.info(f"No preferences found for user {user_id}, using defaults")
+                return self._get_default_bible(project_id)
            if not isinstance(preferences, dict):
                logger.warning(f"Podcast Bible preferences payload is non-dict for user {user_id}, using defaults")
                preferences = {}
@@ -129,18 +146,23 @@ class PodcastBibleService:
                name="AI Host",
                background="Industry Professional",
                expertise_level="Expert",
+                personality_traits=["Professional", "Informative"],
                vocal_style="Authoritative",
-                vocal_characteristics=["Deep", "Steady"]
+                vocal_characteristics=["Deep", "Steady"],
+                look="A professional individual dressed in business-casual attire."
            ),
            audience=AudienceDNA(
                expertise_level="Intermediate",
                interests=["Industry Trends", "Technology"],
-                pain_points=["Staying Competitive", "Operational Efficiency"]
+                pain_points=["Staying Competitive", "Operational Efficiency"],
+                demographics=None
            ),
            brand=BrandDNA(
                industry="General Business",
                tone="Professional",
-                communication_style="Analytical"
+                communication_style="Analytical",
+                key_messages=[],
+                competitor_context=None
            ),
            visual_style=VisualStyle(
                environment="Professional modern office studio",
--- a/backend/services/startup_health.py
+++ b/backend/services/startup_health.py
@@ -156,6 +156,12 @@ def _check_production_api_key_loading(
    if deploy_env == "local":
        _record_check(checks, "production_api_key_loading", True, "skipped in local deploy mode")
        return
+    
+    # Also skip in podcast-only mode (no production API keys needed)
+    enabled_features = os.getenv("ALWRITY_ENABLED_FEATURES", "all").strip().lower()
+    if enabled_features == "podcast":
+        _record_check(checks, "production_api_key_loading", True, "skipped in podcast-only mode")
+        return

    test_tenant_id = os.getenv("ALWRITY_STARTUP_TEST_TENANT_ID", "").strip()
    if not test_tenant_id:
--- a/backend/services/story_writer/audio_generation_service.py
+++ b/backend/services/story_writer/audio_generation_service.py
@@ -46,6 +46,7 @@ class StoryAudioGenerationService:
            return _get_story_media_write_dir("audio", user_id=user_id, db=db)
        except Exception as e:
            logger.warning(f"[StoryAudioGeneration] Failed to resolve user workspace path for {user_id}: {e}")
+            # Don't fall back to default - keep using the already-set output_dir for podcast
            return self.output_dir

    def _generate_audio_filename(self, scene_number: int, scene_title: str) -> str:
@@ -318,6 +319,7 @@ class StoryAudioGenerationService:
        text: str,
        user_id: str,
        voice_id: str = "Wise_Woman",
+        custom_voice_id: Optional[str] = None,
        speed: float = 1.0,
        volume: float = 1.0,
        pitch: float = 0.0,
@@ -364,6 +366,7 @@ class StoryAudioGenerationService:
            result = generate_audio(
                text=text.strip(),
                voice_id=voice_id,
+                custom_voice_id=custom_voice_id,
                speed=speed,
                volume=volume,
                pitch=pitch,
@@ -378,8 +381,8 @@ class StoryAudioGenerationService:
                enable_sync_mode=enable_sync_mode,
            )
            
-            # Determine output directory (user workspace or default)
-            output_dir = self._get_user_audio_dir(user_id, db)
+            # Use the output_dir that was set when service was created (already handles podcast vs story)
+            output_dir = self.output_dir
            
            # Save audio to file
            audio_filename = self._generate_audio_filename(scene_number, scene_title)
--- a/backend/services/subscription/pricing_service.py
+++ b/backend/services/subscription/pricing_service.py
@@ -442,9 +442,34 @@ class PricingService:
                "description": "AI Audio Generation default pricing"
            }
        ]
+
+        # WaveSpeed LLM Text Generation Pricing (via Cerebras)
+        wavespeed_llm_pricing = [
+            {
+                "provider": APIProvider.WAVESPEED,
+                "model_name": "openai/gpt-oss-120b",
+                "cost_per_input_token": 0.0000006,   # $0.60 per 1M input tokens
+                "cost_per_output_token": 0.0000006,  # $0.60 per 1M output tokens
+                "description": "WaveSpeed GPT-OSS 120B (Cerebras) - Fast text generation"
+            },
+            {
+                "provider": APIProvider.WAVESPEED,
+                "model_name": "openai/gpt-oss-120b:cerebras",
+                "cost_per_input_token": 0.0000006,
+                "cost_per_output_token": 0.0000006,
+                "description": "WaveSpeed GPT-OSS 120B (Cerebras) - Fast text generation"
+            },
+            {
+                "provider": APIProvider.WAVESPEED,
+                "model_name": "openai/gpt-oss-20b",
+                "cost_per_input_token": 0.0000002,   # $0.20 per 1M input tokens
+                "cost_per_output_token": 0.0000002,  # $0.20 per 1M output tokens
+                "description": "WaveSpeed GPT-OSS 20B (Cerebras) - Cost-effective text generation"
+            },
+        ]
        
        # Combine all pricing data (include video pricing in search_pricing list)
-        all_pricing = gemini_pricing + openai_pricing + anthropic_pricing + mistral_pricing + search_pricing
+        all_pricing = gemini_pricing + openai_pricing + anthropic_pricing + mistral_pricing + search_pricing + wavespeed_llm_pricing
        
        # Insert or update pricing data
        for pricing_data in all_pricing:
--- a/backend/services/wavespeed/client.py
+++ b/backend/services/wavespeed/client.py
@@ -241,6 +241,7 @@ class WaveSpeedClient:
        self,
        text: str,
        voice_id: str,
+        custom_voice_id: Optional[str] = None,
        speed: float = 1.0,
        volume: float = 1.0,
        pitch: float = 0.0,
@@ -255,6 +256,7 @@ class WaveSpeedClient:
        Args:
            text: Text to convert to speech (max 10000 characters)
            voice_id: Voice ID (e.g., "Wise_Woman", "Friendly_Person", etc.)
+            custom_voice_id: Custom voice clone ID for using cloned voice
            speed: Speech speed (0.5-2.0, default: 1.0)
            volume: Speech volume (0.1-10.0, default: 1.0)
            pitch: Speech pitch (-12 to 12, default: 0.0)
@@ -269,6 +271,7 @@ class WaveSpeedClient:
        return self.speech.generate_speech(
            text=text,
            voice_id=voice_id,
+            custom_voice_id=custom_voice_id,
            speed=speed,
            volume=volume,
            pitch=pitch,
--- a/backend/services/wavespeed/generators/speech.py
+++ b/backend/services/wavespeed/generators/speech.py
@@ -40,6 +40,7 @@ class SpeechGenerator:
        self,
        text: str,
        voice_id: str,
+        custom_voice_id: Optional[str] = None,
        speed: float = 1.0,
        volume: float = 1.0,
        pitch: float = 0.0,
@@ -54,6 +55,7 @@ class SpeechGenerator:
        Args:
            text: Text to convert to speech (max 10000 characters)
            voice_id: Voice ID (e.g., "Wise_Woman", "Friendly_Person", etc.)
+            custom_voice_id: Custom voice clone ID for using cloned voice
            speed: Speech speed (0.5-2.0, default: 1.0)
            volume: Speech volume (0.1-10.0, default: 1.0)
            pitch: Speech pitch (-12 to 12, default: 0.0)
@@ -77,6 +79,11 @@ class SpeechGenerator:
        if not sanitized_voice_id:
            raise ValueError("Voice ID cannot be empty after sanitization")
        
+        # Sanitize custom_voice_id if provided
+        sanitized_custom_voice_id = None
+        if custom_voice_id:
+            sanitized_custom_voice_id = str(custom_voice_id).strip() or None
+        
        # Ensure numeric parameters are proper floats and within valid ranges
        sanitized_speed = max(0.5, min(2.0, float(speed))) if speed is not None else 1.0
        sanitized_volume = max(0.1, min(10.0, float(volume))) if volume is not None else 1.0
@@ -112,6 +119,10 @@ class SpeechGenerator:
            "enable_sync_mode": bool(enable_sync_mode),
        }
        
+        # Add custom voice clone ID if provided
+        if sanitized_custom_voice_id:
+            payload["custom_voice_id"] = sanitized_custom_voice_id
+        
        # Add optional parameters with proper type validation
        optional_params = [
            "english_normalization",
@@ -179,6 +190,20 @@ class SpeechGenerator:
        
        if response.status_code != 200:
            logger.error(f"[WaveSpeed] Speech generation failed: {response.status_code} {response.text}")
+            
+            # Check for custom voice ID specific errors
+            response_text = response.text.lower()
+            if "custom_voice" in response_text or "voice_id" in response_text:
+                raise HTTPException(
+                    status_code=400,
+                    detail={
+                        "error": "Invalid voice clone ID",
+                        "message": "The custom voice ID is invalid or expired. Please create a new voice clone or use a predefined voice.",
+                        "status_code": response.status_code,
+                        "response": response.text,
+                    },
+                )
+            
            raise HTTPException(
                status_code=502,
                detail={
--- a/backend/services/wavespeed/infinitetalk.py
+++ b/backend/services/wavespeed/infinitetalk.py
@@ -26,20 +26,24 @@ def _generate_simple_infinitetalk_prompt(
    story_context: Dict[str, Any],
 ) -> Optional[str]:
    """
-    Generate a balanced, concise prompt for InfiniteTalk.
-    InfiniteTalk is audio-driven, so the prompt should describe the scene and suggest
-    subtle motion, but avoid overly elaborate cinematic descriptions.
+    Generate an enhanced prompt for InfiniteTalk video generation.
+    Includes scene content, analysis, bible context, and visual elements.
    
    Returns None if no meaningful prompt can be generated.
    """
    title = (scene_data.get("title") or "").strip()
    description = (scene_data.get("description") or "").strip()
    image_prompt = (scene_data.get("image_prompt") or "").strip()
+    lines = scene_data.get("lines", [])
+    narration = ""
+    if lines:
+        # Combine first few lines for context
+        narration = " ".join([str(l.get("text", "")) for l in lines[:3]])[:150]
    
-    # Build a balanced prompt: scene description + simple motion hint
+    # Build enhanced prompt with multiple context sources
    parts = []
    
-    # Add scene context
+    # Add main scene title
    if title and len(title) > 5 and title.lower() not in ("scene", "podcast", "episode"):
        parts.append(title)
    
@@ -48,60 +52,70 @@ def _generate_simple_infinitetalk_prompt(
    if analysis:
        content_type = analysis.get("content_type")
        if content_type:
-             parts.append(f"Style: {content_type}")
+            parts.append(f"Content type: {content_type}")
        
-        # Audience helps define the formality/vibe
+        # Add key takeaways if available
+        key_takeaways = analysis.get("keyTakeaways", [])
+        if key_takeaways and isinstance(key_takeaways, list) and len(key_takeaways) > 0:
+            takeaway = str(key_takeaways[0])[:80]
+            if takeaway:
+                parts.append(f"Key insight: {takeaway}")
+        
+        # Audience
        audience = analysis.get("audience")
        if audience:
-             # Just use first few words of audience to keep it short
-             short_audience = " ".join(audience.split()[:3])
-             parts.append(f"For: {short_audience}")
-
-    # Add bible context if available
+            short_audience = " ".join(audience.split()[:3])
+            parts.append(f"Target audience: {short_audience}")
+        
+        # Guest info
+        guest_name = analysis.get("guestName")
+        guest_expertise = analysis.get("guestExpertise")
+        if guest_name:
+            parts.append(f"Guest: {guest_name}")
+        if guest_expertise:
+            parts.append(f"Expertise: {guest_expertise}")
+    
+    # Add bible context
    bible = story_context.get("bible", {})
    if bible:
        host_persona = bible.get("host_persona")
        tone = bible.get("tone")
+        visual_style = bible.get("visual_style")
+        background = bible.get("background")
+        
        if host_persona:
-            parts.append(f"Host: {host_persona}")
+            parts.append(f"Host persona: {host_persona}")
        if tone:
            parts.append(f"Tone: {tone}")
-
-    elif description:
-        # Take first sentence or first 60 chars
-        desc_part = description.split('.')[0][:60].strip()
-        if desc_part:
-            parts.append(desc_part)
-    elif image_prompt:
-        # Take first sentence or first 60 chars
-        img_part = image_prompt.split('.')[0][:60].strip()
+        if visual_style:
+            parts.append(f"Visual style: {visual_style}")
+        if background:
+            parts.append(f"Background: {background}")
+    
+    # Add original image prompt as fallback context
+    if image_prompt and len(parts) < 3:
+        img_part = image_prompt.split('.')[0][:100].strip()
        if img_part:
-            parts.append(img_part)
+            parts.append(f"Visual context: {img_part}")
+    
+    # Add narration snippet if available
+    if narration and len(parts) < 4:
+        parts.append(f"Discussing: {narration}")
    
    if not parts:
        return None
    
-    # Add a simple, subtle motion suggestion (not elaborate camera movements)
-    # Keep it natural and audio-driven
-    motion_hints = [
-        "with subtle movement",
-        "with gentle motion",
-        "with natural animation",
-    ]
+    # Build prompt with visual quality keywords
+    quality_keywords = "Cinematic lighting, high detail, 4k quality, smooth motion"
    
-    # Combine scene description with subtle motion hint
-    if len(parts[0]) < 80:
-        # Room for a motion hint
-        prompt = f"{parts[0]}, {motion_hints[0]}"
-    else:
-        # Just use the description if it's already long enough
-        prompt = parts[0]
+    # Combine parts into final prompt
+    prompt = f"{'. '.join(parts)}. {quality_keywords}. With subtle natural movement."
    
-    # Keep it concise - max 120 characters (allows for scene + motion hint)
-    prompt = prompt[:120].strip()
+    # Allow more room for detailed prompts - max 350 characters
+    prompt = prompt[:350].strip()
    
-    # Clean up trailing commas or incomplete sentences
-    if prompt.endswith(','):
+    # Clean up trailing punctuation
+    if prompt.endswith(',') or prompt.endswith('.'):
        prompt = prompt[:-1].strip()
    
    return prompt if len(prompt) >= 15 else None