feat(podcast): add pre-estimate endpoint, enhance cost estimator with multi-model support, cleanup alpha pricing seeding
- Add POST /podcast/pre-estimate endpoint for cost estimation before analysis
- Enhance cost_estimator.py with multi-model support (gemini, audio, voice clone, image, video)
- Add detailed cost breakdown (llm, audio, media costs + per-phase breakdown)
- Remove redundant pricing seeding from init_alpha_subscription_tiers.py
- Add single-source-of-truth (SSOT) pricing via PricingService.initialize_default_pricing()
- Update TopicUrlInput tooltip to show estimate details
- Add debug logging for pricing seeding and pre-estimate
- Clean up verbose podcast mode debug logs in app.py
This commit is contained in:
@@ -3,6 +3,13 @@ Podcast cost estimation helpers.
|
||||
|
||||
Builds user-facing podcast estimates from the subscription pricing catalog
|
||||
instead of hard-coded frontend heuristics.
|
||||
|
||||
Supports multiple models for each component:
|
||||
- Audio TTS: minimax/speech-02-hd (default), qwen3-tts, cosyvoice-tts
|
||||
- Voice Clone: qwen3, cosyvoice, minimax
|
||||
- Image: qwen-image (default), ideogram-v3-turbo
|
||||
- Video: wan-2.5 (default), kling-v2.5, infinitetalk
|
||||
- LLM: gemini-2.5-flash (default)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -23,6 +30,7 @@ def _load_pricing(
|
||||
provider: APIProvider,
|
||||
preferred_model: str,
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""Load pricing for a provider and model, with fallback to default."""
|
||||
pricing = pricing_service.get_pricing_for_provider_model(provider, preferred_model)
|
||||
if pricing:
|
||||
return pricing
|
||||
@@ -30,6 +38,17 @@ def _load_pricing(
|
||||
return pricing_service.get_pricing_for_provider_model(provider, "default")
|
||||
|
||||
|
||||
# Default models used in podcast generation
|
||||
DEFAULT_MODELS = {
|
||||
"gemini": "gemini-2.5-flash",
|
||||
"exa": "exa-search",
|
||||
"audio_tts": "minimax/speech-02-hd",
|
||||
"voice_clone": "wavespeed-ai/qwen3-tts/voice-clone",
|
||||
"image": "qwen-image",
|
||||
"video": "wan-2.5",
|
||||
}
|
||||
|
||||
|
||||
def estimate_podcast_cost(
|
||||
*,
|
||||
db: Session,
|
||||
@@ -37,88 +56,150 @@ def estimate_podcast_cost(
|
||||
speakers: int,
|
||||
query_count: int,
|
||||
include_avatar_phase: bool = True,
|
||||
# Optional model overrides
|
||||
gemini_model: str = "gemini-2.5-flash",
|
||||
audio_tts_model: str = "minimax/speech-02-hd",
|
||||
voice_clone_engine: str = "qwen3",
|
||||
image_model: str = "qwen-image",
|
||||
video_model: str = "wan-2.5",
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Compute a backend estimate for podcast creation.
|
||||
|
||||
Returns None when pricing rows are unavailable so UI can display "Unavailable".
|
||||
|
||||
Supports customizable models for each component.
|
||||
Uses pricing_catalog for accurate cost calculation.
|
||||
"""
|
||||
pricing_service = PricingService(db)
|
||||
|
||||
gemini_pricing = _load_pricing(pricing_service, APIProvider.GEMINI, "gemini-2.5-flash")
|
||||
# Load pricing for each component and model
|
||||
gemini_pricing = _load_pricing(pricing_service, APIProvider.GEMINI, gemini_model)
|
||||
exa_pricing = _load_pricing(pricing_service, APIProvider.EXA, "exa-search")
|
||||
audio_pricing = _load_pricing(pricing_service, APIProvider.AUDIO, "minimax/speech-02-hd")
|
||||
video_pricing = _load_pricing(pricing_service, APIProvider.VIDEO, "default")
|
||||
image_pricing = _load_pricing(pricing_service, APIProvider.STABILITY, "qwen-image")
|
||||
|
||||
# Audio TTS pricing (model defaults to minimax/speech-02-hd; overridable via audio_tts_model)
|
||||
audio_pricing = _load_pricing(pricing_service, APIProvider.AUDIO, audio_tts_model)
|
||||
|
||||
# Voice clone pricing (different engines)
|
||||
voice_clone_model = f"wavespeed-ai/{voice_clone_engine}-tts/voice-clone"
|
||||
voice_clone_pricing = _load_pricing(pricing_service, APIProvider.AUDIO, voice_clone_model)
|
||||
if not voice_clone_pricing:
|
||||
# Try alternate model names
|
||||
voice_clone_pricing = _load_pricing(pricing_service, APIProvider.AUDIO, f"{voice_clone_engine}/voice-clone")
|
||||
|
||||
# Image pricing (qwen-image or ideogram)
|
||||
image_pricing = _load_pricing(pricing_service, APIProvider.STABILITY, image_model)
|
||||
|
||||
# Video pricing (wan-2.5, kling, or infinitetalk)
|
||||
video_pricing = _load_pricing(pricing_service, APIProvider.VIDEO, video_model)
|
||||
|
||||
# Return None if the LLM (Gemini) pricing is unavailable (fail fast); other missing pricing rows contribute 0 cost
|
||||
if not gemini_pricing:
|
||||
return None
|
||||
|
||||
# Configuration
|
||||
minutes = max(1, int(duration_minutes or 1))
|
||||
speaker_count = max(1, int(speakers or 1))
|
||||
research_queries = max(1, int(query_count or 1))
|
||||
|
||||
# Phase-level usage assumptions (token/request proxies for pre-creation estimate).
|
||||
# Token usage assumptions per phase
|
||||
analysis_input_tokens = 1800
|
||||
analysis_output_tokens = 1000
|
||||
research_synthesis_input_tokens = 2200
|
||||
research_synthesis_output_tokens = 900
|
||||
script_input_tokens = max(1800, minutes * 300)
|
||||
script_output_tokens = max(2200, minutes * 700)
|
||||
|
||||
# TTS token proxy: ~900 chars per minute per speaker.
|
||||
|
||||
# TTS: ~900 chars per minute per speaker
|
||||
estimated_tts_tokens = max(900, minutes * 900 * speaker_count)
|
||||
|
||||
# Voice clone: 1 clone operation per speaker
|
||||
voice_clone_count = speaker_count
|
||||
|
||||
# ===== COST CALCULATIONS =====
|
||||
|
||||
# 1. Analysis phase (LLM)
|
||||
analysis_cost = (
|
||||
analysis_input_tokens * float(gemini_pricing.get("cost_per_input_token") or 0.0)
|
||||
+ analysis_output_tokens * float(gemini_pricing.get("cost_per_output_token") or 0.0)
|
||||
+ float(gemini_pricing.get("cost_per_request") or 0.0)
|
||||
)
|
||||
|
||||
# 2. Research phase
|
||||
# 2a. LLM for research synthesis
|
||||
research_llm_cost = (
|
||||
research_synthesis_input_tokens * float(gemini_pricing.get("cost_per_input_token") or 0.0)
|
||||
+ research_synthesis_output_tokens * float(gemini_pricing.get("cost_per_output_token") or 0.0)
|
||||
+ float(gemini_pricing.get("cost_per_request") or 0.0)
|
||||
)
|
||||
script_cost = (
|
||||
script_input_tokens * float(gemini_pricing.get("cost_per_input_token") or 0.0)
|
||||
+ script_output_tokens * float(gemini_pricing.get("cost_per_output_token") or 0.0)
|
||||
+ float(gemini_pricing.get("cost_per_request") or 0.0)
|
||||
)
|
||||
|
||||
# 2b. Search API (Exa)
|
||||
research_search_cost = 0.0
|
||||
if exa_pricing:
|
||||
research_search_cost = research_queries * float(exa_pricing.get("cost_per_request") or 0.0)
|
||||
research_cost = research_search_cost + research_llm_cost
|
||||
|
||||
# 3. Script generation (LLM)
|
||||
script_cost = (
|
||||
script_input_tokens * float(gemini_pricing.get("cost_per_input_token") or 0.0)
|
||||
+ script_output_tokens * float(gemini_pricing.get("cost_per_output_token") or 0.0)
|
||||
)
|
||||
|
||||
# 4. Audio TTS
|
||||
tts_cost = 0.0
|
||||
if audio_pricing:
|
||||
tts_cost = (
|
||||
estimated_tts_tokens * float(audio_pricing.get("cost_per_input_token") or 0.0)
|
||||
+ float(audio_pricing.get("cost_per_request") or 0.0)
|
||||
tts_cost = estimated_tts_tokens * float(audio_pricing.get("cost_per_input_token") or 0.0)
|
||||
|
||||
# 5. Voice cloning (included whenever voice-clone pricing is available)
|
||||
voice_clone_cost = 0.0
|
||||
if voice_clone_pricing:
|
||||
voice_clone_cost = voice_clone_count * (
|
||||
float(voice_clone_pricing.get("cost_per_request") or 0.0)
|
||||
+ estimated_tts_tokens * float(voice_clone_pricing.get("cost_per_input_token") or 0.0)
|
||||
)
|
||||
|
||||
# Assume one video render request per minute (upper-bound planning estimate).
|
||||
video_cost = 0.0
|
||||
if video_pricing:
|
||||
video_cost = minutes * float(video_pricing.get("cost_per_request") or 0.0)
|
||||
|
||||
# 6. Avatar image generation
|
||||
avatar_cost = 0.0
|
||||
if include_avatar_phase and image_pricing:
|
||||
image_unit = float(image_pricing.get("cost_per_image") or image_pricing.get("cost_per_request") or 0.0)
|
||||
avatar_cost = speaker_count * image_unit
|
||||
|
||||
research_cost = research_search_cost + research_llm_cost
|
||||
total = analysis_cost + research_cost + script_cost + tts_cost + video_cost + avatar_cost
|
||||
# 7. Video rendering
|
||||
video_cost = 0.0
|
||||
if video_pricing:
|
||||
# Assume 1 video render per minute (upper bound)
|
||||
video_cost = minutes * float(video_pricing.get("cost_per_request") or 0.0)
|
||||
|
||||
# ===== TOTALS =====
|
||||
llm_total = analysis_cost + research_llm_cost + script_cost
|
||||
audio_total = tts_cost + voice_clone_cost
|
||||
media_total = avatar_cost + video_cost
|
||||
total = llm_total + research_search_cost + audio_total + media_total
|
||||
|
||||
return {
|
||||
# Cost breakdown
|
||||
"analysisCost": _round_money(analysis_cost),
|
||||
"researchCost": _round_money(research_cost),
|
||||
"researchSearchCost": _round_money(research_search_cost),
|
||||
"researchLlmCost": _round_money(research_llm_cost),
|
||||
"scriptCost": _round_money(script_cost),
|
||||
"ttsCost": _round_money(tts_cost),
|
||||
"voiceCloneCost": _round_money(voice_clone_cost),
|
||||
"avatarCost": _round_money(avatar_cost),
|
||||
"videoCost": _round_money(video_cost),
|
||||
"researchCost": _round_money(research_cost),
|
||||
"analysisCost": _round_money(analysis_cost),
|
||||
"scriptCost": _round_money(script_cost),
|
||||
"total": _round_money(total),
|
||||
# Totals by category
|
||||
"llmCost": _round_money(llm_total),
|
||||
"audioCost": _round_money(audio_total),
|
||||
"mediaCost": _round_money(media_total),
|
||||
# Currency
|
||||
"currency": "USD",
|
||||
"source": "pricing_catalog",
|
||||
# Models used for this estimate
|
||||
"models": {
|
||||
"llm": gemini_model,
|
||||
"research": "exa-search",
|
||||
"audio_tts": audio_tts_model,
|
||||
"voice_clone": voice_clone_model,
|
||||
"image": image_model,
|
||||
"video": video_model,
|
||||
},
|
||||
# Assumptions used
|
||||
"assumptions": {
|
||||
"analysis_input_tokens": analysis_input_tokens,
|
||||
"analysis_output_tokens": analysis_output_tokens,
|
||||
@@ -128,6 +209,8 @@ def estimate_podcast_cost(
|
||||
"script_output_tokens": script_output_tokens,
|
||||
"estimated_tts_tokens": estimated_tts_tokens,
|
||||
"research_queries": research_queries,
|
||||
"voice_clone_count": voice_clone_count,
|
||||
"video_requests": minutes,
|
||||
"avatar_requests": speaker_count if include_avatar_phase else 0,
|
||||
},
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user