feat: podcast demo mode with ALWRITY_ENABLED_FEATURES support

- Add ALWRITY_ENABLED_FEATURES env var for feature gating
- Podcast-only mode: skip LLM bootstrap, scheduler, persona services
- Enhance video generation prompt with scene context, analysis, narration
- Add voice cloning support via custom_voice_id in WaveSpeed
- Add text-to-speech for research results (browser speechSynthesis)
- Fix render queue to sync images from script phase
- Add WaveSpeed LLM pricing (gpt-oss-120b)
- Fix podcast bible generation error handling
- Refactor RouterManager for feature-based router loading
This commit is contained in:
ajaysi
2026-04-03 06:59:59 +05:30
parent c52b1eabc9
commit 63bb937796
58 changed files with 3568 additions and 1597 deletions

View File

@@ -62,6 +62,7 @@ class VoiceCloneResult:
def generate_audio(
text: str,
voice_id: str = "Wise_Woman",
custom_voice_id: Optional[str] = None,
speed: float = 1.0,
volume: float = 1.0,
pitch: float = 0.0,
@@ -173,6 +174,7 @@ def generate_audio(
audio_bytes = client.generate_speech(
text=text,
voice_id=voice_id,
custom_voice_id=custom_voice_id,
speed=speed,
volume=volume,
pitch=pitch,

View File

@@ -67,7 +67,7 @@ def llm_text_gen(
resolved_flow_type = flow_type or ("sif_agent" if preferred_hf_models else "premium_tool")
flow_tag = f"flow_type={resolved_flow_type}"
logger.info(f"[llm_text_gen][{flow_tag}] Starting text generation")
logger.warning(f"[llm_text_gen][{flow_tag}] Starting text generation")
logger.debug(f"[llm_text_gen] Prompt length: {len(prompt)} characters")
# Set default values for LLM parameters
@@ -94,7 +94,7 @@ def llm_text_gen(
primary_provider = provider_list[0]
if primary_provider in ['wavespeed', 'wave']:
gpt_provider = "wavespeed"
model = os.getenv('WAVESPEED_TEXT_MODEL', 'openai/gpt-oss-120b:cerebras')
model = os.getenv('WAVESPEED_TEXT_MODEL', 'openai/gpt-oss-120b')
elif primary_provider in ['gemini', 'google']:
gpt_provider = "google"
model = "gemini-2.0-flash-001"
@@ -111,7 +111,7 @@ def llm_text_gen(
elif preferred_provider:
if preferred_provider in ['wavespeed', 'wave']:
gpt_provider = "wavespeed"
model = os.getenv('WAVESPEED_TEXT_MODEL', 'openai/gpt-oss-120b:cerebras')
model = os.getenv('WAVESPEED_TEXT_MODEL', 'openai/gpt-oss-120b')
elif preferred_provider in ['openai', 'gpt']:
gpt_provider = "openai"
model = os.getenv('OPENAI_MODEL', 'gpt-4o-mini')
@@ -166,7 +166,7 @@ def llm_text_gen(
if api_key_manager.get_api_key("wavespeed"):
available_providers.append("wavespeed")
logger.info(
logger.warning(
f"[llm_text_gen][{flow_tag}] Provider preflight: env_provider='{env_provider or 'auto'}', "
f"provider_list={provider_list}, strict_provider_mode={strict_provider_mode}, "
f"available_providers={available_providers}, preferred_provider={preferred_provider or 'none'}, "
@@ -278,7 +278,12 @@ def llm_text_gen(
UsageSummary.billing_period == current_period
).first()
# No separate log here - we'll create unified log after API call and usage tracking
# Log subscription details before making the API call
if usage:
total_llm_calls = (usage.gemini_calls or 0) + (usage.openai_calls or 0) + (usage.anthropic_calls or 0) + (usage.mistral_calls or 0) + (usage.wavespeed_calls or 0)
logger.info(f"[llm_text_gen] Subscription check passed for user {user_id}: provider={actual_provider_name or gpt_provider}, tokens_requested={estimated_total_tokens}, current_usage=${usage.total_cost or 0:.4f}, calls_used={total_llm_calls}")
else:
logger.info(f"[llm_text_gen] Subscription check passed for user {user_id}: provider={actual_provider_name or gpt_provider}, tokens_requested={estimated_total_tokens}, new_user_no_usage_record")
finally:
db.close()
@@ -363,7 +368,7 @@ def llm_text_gen(
from services.llm_providers.wavespeed_provider import wavespeed_text_response
response_text = wavespeed_text_response(
prompt=prompt,
model=model or "openai/gpt-oss-120b:cerebras",
model=model or "openai/gpt-oss-120b",
temperature=temperature,
max_tokens=max_tokens,
top_p=top_p,

View File

@@ -15,14 +15,31 @@ class PodcastBibleService:
"""Service for generating and managing the Podcast Bible."""
def __init__(self):
self.personalization_service = PersonalizationService()
try:
from services.product_marketing.personalization_service import PersonalizationService
self.personalization_service = PersonalizationService()
except Exception as e:
logger.warning(f"Failed to initialize PersonalizationService: {e}")
self.personalization_service = None
def generate_bible(self, user_id: str, project_id: str) -> PodcastBible:
"""Generate a Podcast Bible from onboarding data."""
logger.info(f"Generating Podcast Bible for user {user_id}")
try:
preferences = self.personalization_service.get_user_preferences(user_id) or {}
if not self.personalization_service:
logger.warning("PersonalizationService not available, using default bible")
return self._get_default_bible(project_id)
try:
preferences = self.personalization_service.get_user_preferences(user_id)
except Exception as pref_err:
logger.warning(f"Failed to get user preferences: {pref_err}, using defaults")
return self._get_default_bible(project_id)
if not preferences:
logger.info(f"No preferences found for user {user_id}, using defaults")
return self._get_default_bible(project_id)
if not isinstance(preferences, dict):
logger.warning(f"Podcast Bible preferences payload is non-dict for user {user_id}, using defaults")
preferences = {}
@@ -129,18 +146,23 @@ class PodcastBibleService:
name="AI Host",
background="Industry Professional",
expertise_level="Expert",
personality_traits=["Professional", "Informative"],
vocal_style="Authoritative",
vocal_characteristics=["Deep", "Steady"]
vocal_characteristics=["Deep", "Steady"],
look="A professional individual dressed in business-casual attire."
),
audience=AudienceDNA(
expertise_level="Intermediate",
interests=["Industry Trends", "Technology"],
pain_points=["Staying Competitive", "Operational Efficiency"]
pain_points=["Staying Competitive", "Operational Efficiency"],
demographics=None
),
brand=BrandDNA(
industry="General Business",
tone="Professional",
communication_style="Analytical"
communication_style="Analytical",
key_messages=[],
competitor_context=None
),
visual_style=VisualStyle(
environment="Professional modern office studio",

View File

@@ -156,6 +156,12 @@ def _check_production_api_key_loading(
if deploy_env == "local":
_record_check(checks, "production_api_key_loading", True, "skipped in local deploy mode")
return
# Also skip in podcast-only mode (no production API keys needed)
enabled_features = os.getenv("ALWRITY_ENABLED_FEATURES", "all").strip().lower()
if enabled_features == "podcast":
_record_check(checks, "production_api_key_loading", True, "skipped in podcast-only mode")
return
test_tenant_id = os.getenv("ALWRITY_STARTUP_TEST_TENANT_ID", "").strip()
if not test_tenant_id:

View File

@@ -46,6 +46,7 @@ class StoryAudioGenerationService:
return _get_story_media_write_dir("audio", user_id=user_id, db=db)
except Exception as e:
logger.warning(f"[StoryAudioGeneration] Failed to resolve user workspace path for {user_id}: {e}")
# Don't fall back to default - keep using the already-set output_dir for podcast
return self.output_dir
def _generate_audio_filename(self, scene_number: int, scene_title: str) -> str:
@@ -318,6 +319,7 @@ class StoryAudioGenerationService:
text: str,
user_id: str,
voice_id: str = "Wise_Woman",
custom_voice_id: Optional[str] = None,
speed: float = 1.0,
volume: float = 1.0,
pitch: float = 0.0,
@@ -364,6 +366,7 @@ class StoryAudioGenerationService:
result = generate_audio(
text=text.strip(),
voice_id=voice_id,
custom_voice_id=custom_voice_id,
speed=speed,
volume=volume,
pitch=pitch,
@@ -378,8 +381,8 @@ class StoryAudioGenerationService:
enable_sync_mode=enable_sync_mode,
)
# Determine output directory (user workspace or default)
output_dir = self._get_user_audio_dir(user_id, db)
# Use the output_dir that was set when the service was created (already handles podcast vs. story)
output_dir = self.output_dir
# Save audio to file
audio_filename = self._generate_audio_filename(scene_number, scene_title)

View File

@@ -442,9 +442,34 @@ class PricingService:
"description": "AI Audio Generation default pricing"
}
]
# WaveSpeed LLM Text Generation Pricing (via Cerebras)
wavespeed_llm_pricing = [
{
"provider": APIProvider.WAVESPEED,
"model_name": "openai/gpt-oss-120b",
"cost_per_input_token": 0.0000006, # $0.60 per 1M input tokens
"cost_per_output_token": 0.0000006, # $0.60 per 1M output tokens
"description": "WaveSpeed GPT-OSS 120B (Cerebras) - Fast text generation"
},
{
"provider": APIProvider.WAVESPEED,
"model_name": "openai/gpt-oss-120b:cerebras",
"cost_per_input_token": 0.0000006,
"cost_per_output_token": 0.0000006,
"description": "WaveSpeed GPT-OSS 120B (Cerebras) - Fast text generation"
},
{
"provider": APIProvider.WAVESPEED,
"model_name": "openai/gpt-oss-20b",
"cost_per_input_token": 0.0000002, # $0.20 per 1M input tokens
"cost_per_output_token": 0.0000002, # $0.20 per 1M output tokens
"description": "WaveSpeed GPT-OSS 20B (Cerebras) - Cost-effective text generation"
},
]
# Combine all pricing data (include video pricing in search_pricing list)
all_pricing = gemini_pricing + openai_pricing + anthropic_pricing + mistral_pricing + search_pricing
all_pricing = gemini_pricing + openai_pricing + anthropic_pricing + mistral_pricing + search_pricing + wavespeed_llm_pricing
# Insert or update pricing data
for pricing_data in all_pricing:

View File

@@ -241,6 +241,7 @@ class WaveSpeedClient:
self,
text: str,
voice_id: str,
custom_voice_id: Optional[str] = None,
speed: float = 1.0,
volume: float = 1.0,
pitch: float = 0.0,
@@ -255,6 +256,7 @@ class WaveSpeedClient:
Args:
text: Text to convert to speech (max 10000 characters)
voice_id: Voice ID (e.g., "Wise_Woman", "Friendly_Person", etc.)
custom_voice_id: Optional ID of a custom (cloned) voice (default: None)
speed: Speech speed (0.5-2.0, default: 1.0)
volume: Speech volume (0.1-10.0, default: 1.0)
pitch: Speech pitch (-12 to 12, default: 0.0)
@@ -269,6 +271,7 @@ class WaveSpeedClient:
return self.speech.generate_speech(
text=text,
voice_id=voice_id,
custom_voice_id=custom_voice_id,
speed=speed,
volume=volume,
pitch=pitch,

View File

@@ -40,6 +40,7 @@ class SpeechGenerator:
self,
text: str,
voice_id: str,
custom_voice_id: Optional[str] = None,
speed: float = 1.0,
volume: float = 1.0,
pitch: float = 0.0,
@@ -54,6 +55,7 @@ class SpeechGenerator:
Args:
text: Text to convert to speech (max 10000 characters)
voice_id: Voice ID (e.g., "Wise_Woman", "Friendly_Person", etc.)
custom_voice_id: Optional ID of a custom (cloned) voice; sent in the request only when non-empty (default: None)
speed: Speech speed (0.5-2.0, default: 1.0)
volume: Speech volume (0.1-10.0, default: 1.0)
pitch: Speech pitch (-12 to 12, default: 0.0)
@@ -77,6 +79,11 @@ class SpeechGenerator:
if not sanitized_voice_id:
raise ValueError("Voice ID cannot be empty after sanitization")
# Sanitize custom_voice_id if provided
sanitized_custom_voice_id = None
if custom_voice_id:
sanitized_custom_voice_id = str(custom_voice_id).strip() or None
# Ensure numeric parameters are proper floats and within valid ranges
sanitized_speed = max(0.5, min(2.0, float(speed))) if speed is not None else 1.0
sanitized_volume = max(0.1, min(10.0, float(volume))) if volume is not None else 1.0
@@ -112,6 +119,10 @@ class SpeechGenerator:
"enable_sync_mode": bool(enable_sync_mode),
}
# Add custom voice clone ID if provided
if sanitized_custom_voice_id:
payload["custom_voice_id"] = sanitized_custom_voice_id
# Add optional parameters with proper type validation
optional_params = [
"english_normalization",
@@ -179,6 +190,20 @@ class SpeechGenerator:
if response.status_code != 200:
logger.error(f"[WaveSpeed] Speech generation failed: {response.status_code} {response.text}")
# Check for custom voice ID specific errors
response_text = response.text.lower()
if "custom_voice" in response_text or "voice_id" in response_text:
raise HTTPException(
status_code=400,
detail={
"error": "Invalid voice clone ID",
"message": "The custom voice ID is invalid or expired. Please create a new voice clone or use a predefined voice.",
"status_code": response.status_code,
"response": response.text,
},
)
raise HTTPException(
status_code=502,
detail={

View File

@@ -26,20 +26,24 @@ def _generate_simple_infinitetalk_prompt(
story_context: Dict[str, Any],
) -> Optional[str]:
"""
Generate a balanced, concise prompt for InfiniteTalk.
InfiniteTalk is audio-driven, so the prompt should describe the scene and suggest
subtle motion, but avoid overly elaborate cinematic descriptions.
Generate an enhanced prompt for InfiniteTalk video generation.
Includes scene content, analysis, bible context, and visual elements.
Returns None if no meaningful prompt can be generated.
"""
title = (scene_data.get("title") or "").strip()
description = (scene_data.get("description") or "").strip()
image_prompt = (scene_data.get("image_prompt") or "").strip()
lines = scene_data.get("lines", [])
narration = ""
if lines:
# Combine the text of the first three lines (capped at 150 characters) for context
narration = " ".join([str(l.get("text", "")) for l in lines[:3]])[:150]
# Build a balanced prompt: scene description + simple motion hint
# Build enhanced prompt with multiple context sources
parts = []
# Add scene context
# Add main scene title
if title and len(title) > 5 and title.lower() not in ("scene", "podcast", "episode"):
parts.append(title)
@@ -48,60 +52,70 @@ def _generate_simple_infinitetalk_prompt(
if analysis:
content_type = analysis.get("content_type")
if content_type:
parts.append(f"Style: {content_type}")
parts.append(f"Content type: {content_type}")
# Audience helps define the formality/vibe
# Add key takeaways if available
key_takeaways = analysis.get("keyTakeaways", [])
if key_takeaways and isinstance(key_takeaways, list) and len(key_takeaways) > 0:
takeaway = str(key_takeaways[0])[:80]
if takeaway:
parts.append(f"Key insight: {takeaway}")
# Audience
audience = analysis.get("audience")
if audience:
# Just use first few words of audience to keep it short
short_audience = " ".join(audience.split()[:3])
parts.append(f"For: {short_audience}")
# Add bible context if available
short_audience = " ".join(audience.split()[:3])
parts.append(f"Target audience: {short_audience}")
# Guest info
guest_name = analysis.get("guestName")
guest_expertise = analysis.get("guestExpertise")
if guest_name:
parts.append(f"Guest: {guest_name}")
if guest_expertise:
parts.append(f"Expertise: {guest_expertise}")
# Add bible context
bible = story_context.get("bible", {})
if bible:
host_persona = bible.get("host_persona")
tone = bible.get("tone")
visual_style = bible.get("visual_style")
background = bible.get("background")
if host_persona:
parts.append(f"Host: {host_persona}")
parts.append(f"Host persona: {host_persona}")
if tone:
parts.append(f"Tone: {tone}")
elif description:
# Take first sentence or first 60 chars
desc_part = description.split('.')[0][:60].strip()
if desc_part:
parts.append(desc_part)
elif image_prompt:
# Take first sentence or first 60 chars
img_part = image_prompt.split('.')[0][:60].strip()
if visual_style:
parts.append(f"Visual style: {visual_style}")
if background:
parts.append(f"Background: {background}")
# Add original image prompt as fallback context
if image_prompt and len(parts) < 3:
img_part = image_prompt.split('.')[0][:100].strip()
if img_part:
parts.append(img_part)
parts.append(f"Visual context: {img_part}")
# Add narration snippet if available
if narration and len(parts) < 4:
parts.append(f"Discussing: {narration}")
if not parts:
return None
# Add a simple, subtle motion suggestion (not elaborate camera movements)
# Keep it natural and audio-driven
motion_hints = [
"with subtle movement",
"with gentle motion",
"with natural animation",
]
# Build prompt with visual quality keywords
quality_keywords = "Cinematic lighting, high detail, 4k quality, smooth motion"
# Combine scene description with subtle motion hint
if len(parts[0]) < 80:
# Room for a motion hint
prompt = f"{parts[0]}, {motion_hints[0]}"
else:
# Just use the description if it's already long enough
prompt = parts[0]
# Combine parts into final prompt
prompt = f"{'. '.join(parts)}. {quality_keywords}. With subtle natural movement."
# Keep it concise - max 120 characters (allows for scene + motion hint)
prompt = prompt[:120].strip()
# Allow more room for detailed prompts - max 350 characters
prompt = prompt[:350].strip()
# Clean up trailing commas or incomplete sentences
if prompt.endswith(','):
# Clean up trailing punctuation
if prompt.endswith(',') or prompt.endswith('.'):
prompt = prompt[:-1].strip()
return prompt if len(prompt) >= 15 else None