ALwrity/backend/api/podcast/cost_estimator.py

"""
Podcast cost estimation helpers.

Builds user-facing podcast estimates from the subscription pricing catalog
instead of hard-coded frontend heuristics.

Supports multiple models for each component:
- Audio TTS: minimax/speech-02-hd (default), qwen3-tts, cosyvoice-tts
- Voice Clone: qwen3, cosyvoice, minimax
- Image: qwen-image (default), ideogram-v3-turbo
- Video: wan-2.5 (default), kling-v2.5, infinitetalk
- LLM: gemini-2.5-flash (default)
"""

from __future__ import annotations

from typing import Any, Dict, Optional
from sqlalchemy.orm import Session

from models.subscription_models import APIProvider
from services.subscription.pricing_service import PricingService


def _round_money(value: float) -> float:
    return round(float(value), 4)


def _load_pricing(
    pricing_service: PricingService,
    provider: APIProvider,
    preferred_model: str,
) -> Optional[Dict[str, Any]]:
    """Load pricing for a provider and model, with fallback to default."""
    pricing = pricing_service.get_pricing_for_provider_model(provider, preferred_model)
    if pricing:
        return pricing
    # Fallback to provider default model row (if configured).
    return pricing_service.get_pricing_for_provider_model(provider, "default")


# Default models used in podcast generation
DEFAULT_MODELS = {
    "gemini": "gemini-2.5-flash",
    "exa": "exa-search",
    "audio_tts": "minimax/speech-02-hd",
    "voice_clone": "wavespeed-ai/qwen3-tts/voice-clone",
    "image": "qwen-image",
    "video": "wan-2.5",
}


def estimate_podcast_cost(
    *,
    db: Session,
    duration_minutes: int,
    speakers: int,
    query_count: int,
    include_avatar_phase: bool = True,
    # Optional model overrides
    gemini_model: str = "gemini-2.5-flash",
    audio_tts_model: str = "minimax/speech-02-hd",
    voice_clone_engine: str = "qwen3",
    image_model: str = "qwen-image",
    video_model: str = "wan-2.5",
) -> Optional[Dict[str, Any]]:
    """
    Compute a backend estimate for podcast creation.

    Supports customizable models for each component.
    Uses pricing_catalog for accurate cost calculation.
    """
    pricing_service = PricingService(db)

    # Load pricing for each component and model
    gemini_pricing = _load_pricing(pricing_service, APIProvider.GEMINI, gemini_model)
    exa_pricing = _load_pricing(pricing_service, APIProvider.EXA, "exa-search")

    # Audio TTS pricing (minimax/speech-02-hd)
    audio_pricing = _load_pricing(pricing_service, APIProvider.AUDIO, audio_tts_model)

    # Voice clone pricing (different engines)
    voice_clone_model = f"wavespeed-ai/{voice_clone_engine}-tts/voice-clone"
    voice_clone_pricing = _load_pricing(pricing_service, APIProvider.AUDIO, voice_clone_model)
    if not voice_clone_pricing:
        # Try alternate model names
        voice_clone_pricing = _load_pricing(pricing_service, APIProvider.AUDIO, f"{voice_clone_engine}/voice-clone")

    # Image pricing (qwen-image or ideogram)
    image_pricing = _load_pricing(pricing_service, APIProvider.STABILITY, image_model)

    # Video pricing (wan-2.5, kling, or infinitetalk)
    video_pricing = _load_pricing(pricing_service, APIProvider.VIDEO, video_model)

    # Return None if critical pricing unavailable (fail fast)
    if not gemini_pricing:
        return None

    # Configuration
    minutes = max(1, int(duration_minutes or 1))
    speaker_count = max(1, int(speakers or 1))
    research_queries = max(1, int(query_count or 1))

    # Token usage assumptions per phase
    analysis_input_tokens = 1800
    analysis_output_tokens = 1000
    research_synthesis_input_tokens = 2200
    research_synthesis_output_tokens = 900
    script_input_tokens = max(1800, minutes * 300)
    script_output_tokens = max(2200, minutes * 700)

    # TTS: ~900 chars per minute per speaker
    estimated_tts_tokens = max(900, minutes * 900 * speaker_count)

    # Voice clone: 1 clone operation per speaker
    voice_clone_count = speaker_count

    # ===== COST CALCULATIONS =====

    # 1. Analysis phase (LLM)
    analysis_cost = (
        analysis_input_tokens * float(gemini_pricing.get("cost_per_input_token") or 0.0)
        + analysis_output_tokens * float(gemini_pricing.get("cost_per_output_token") or 0.0)
    )

    # 2. Research phase
    # 2a. LLM for research synthesis
    research_llm_cost = (
        research_synthesis_input_tokens * float(gemini_pricing.get("cost_per_input_token") or 0.0)
        + research_synthesis_output_tokens * float(gemini_pricing.get("cost_per_output_token") or 0.0)
    )
    # 2b. Search API (Exa)
    research_search_cost = 0.0
    if exa_pricing:
        research_search_cost = research_queries * float(exa_pricing.get("cost_per_request") or 0.0)
    research_cost = research_search_cost + research_llm_cost

    # 3. Script generation (LLM)
    script_cost = (
        script_input_tokens * float(gemini_pricing.get("cost_per_input_token") or 0.0)
        + script_output_tokens * float(gemini_pricing.get("cost_per_output_token") or 0.0)
    )

    # 4. Audio TTS
    tts_cost = 0.0
    if audio_pricing:
        tts_cost = estimated_tts_tokens * float(audio_pricing.get("cost_per_input_token") or 0.0)

    # 5. Voice cloning (if needed)
    voice_clone_cost = 0.0
    if voice_clone_pricing:
        voice_clone_cost = voice_clone_count * (
            float(voice_clone_pricing.get("cost_per_request") or 0.0)
            + estimated_tts_tokens * float(voice_clone_pricing.get("cost_per_input_token") or 0.0)
        )

    # 6. Avatar image generation
    avatar_cost = 0.0
    if include_avatar_phase and image_pricing:
        image_unit = float(image_pricing.get("cost_per_image") or image_pricing.get("cost_per_request") or 0.0)
        avatar_cost = speaker_count * image_unit

    # 7. Video rendering
    video_cost = 0.0
    if video_pricing:
        # Assume 1 video render per minute (upper bound)
        video_cost = minutes * float(video_pricing.get("cost_per_request") or 0.0)

    # ===== TOTALS =====
    llm_total = analysis_cost + research_llm_cost + script_cost
    audio_total = tts_cost + voice_clone_cost
    media_total = avatar_cost + video_cost
    total = llm_total + research_search_cost + audio_total + media_total

    return {
        # Cost breakdown
        "analysisCost": _round_money(analysis_cost),
        "researchCost": _round_money(research_cost),
        "researchSearchCost": _round_money(research_search_cost),
        "researchLlmCost": _round_money(research_llm_cost),
        "scriptCost": _round_money(script_cost),
        "ttsCost": _round_money(tts_cost),
        "voiceCloneCost": _round_money(voice_clone_cost),
        "avatarCost": _round_money(avatar_cost),
        "videoCost": _round_money(video_cost),
        "total": _round_money(total),
        # Totals by category
        "llmCost": _round_money(llm_total),
        "audioCost": _round_money(audio_total),
        "mediaCost": _round_money(media_total),
        # Currency
        "currency": "USD",
        "source": "pricing_catalog",
        # Models used for this estimate
        "models": {
            "llm": gemini_model,
            "research": "exa-search",
            "audio_tts": audio_tts_model,
            "voice_clone": voice_clone_model,
            "image": image_model,
            "video": video_model,
        },
        # Assumptions used
        "assumptions": {
            "analysis_input_tokens": analysis_input_tokens,
            "analysis_output_tokens": analysis_output_tokens,
            "research_synthesis_input_tokens": research_synthesis_input_tokens,
            "research_synthesis_output_tokens": research_synthesis_output_tokens,
            "script_input_tokens": script_input_tokens,
            "script_output_tokens": script_output_tokens,
            "estimated_tts_tokens": estimated_tts_tokens,
            "research_queries": research_queries,
            "voice_clone_count": voice_clone_count,
            "video_requests": minutes,
            "avatar_requests": speaker_count if include_avatar_phase else 0,
        },
    }