"""YouTube Creator scene audio generation handlers."""

from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from typing import Dict, Any, Optional
from pydantic import BaseModel

from services.database import get_db
from middleware.auth_middleware import get_current_user, get_current_user_with_query_token
from api.story_writer.utils.auth import require_authenticated_user
from utils.asset_tracker import save_asset_to_library
from models.story_models import StoryAudioResult
from services.story_writer.audio_generation_service import StoryAudioGenerationService
from pathlib import Path
from utils.logger_utils import get_service_logger

# Router for the YouTube audio endpoints; every route is tagged "youtube-audio".
router = APIRouter(tags=["youtube-audio"])
logger = get_service_logger("api.youtube.audio")

# Audio output directory.
# `base_dir` is reached by four `.parent` hops starting from this file's path.
# NOTE: the directory is created as an import-time side effect of this module.
base_dir = Path(__file__).parent.parent.parent.parent
YOUTUBE_AUDIO_DIR = base_dir / "youtube_audio"
YOUTUBE_AUDIO_DIR.mkdir(parents=True, exist_ok=True)

# Initialize audio service: one module-level instance of the story-writer audio
# service, configured to write into YOUTUBE_AUDIO_DIR.
audio_service = StoryAudioGenerationService(output_dir=str(YOUTUBE_AUDIO_DIR))

# WaveSpeed Minimax Speech voice ids include language-specific voices
# Ref: https://wavespeed.ai/docs/docs-api/minimax/minimax_speech_voice_id
#
# Maps a lowercase language code to the WaveSpeed `language_boost` label.
# `_resolve_language_boost` looks codes up here and falls back to "auto" for
# codes that are not listed.
LANGUAGE_CODE_TO_LANGUAGE_BOOST = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "pt": "Portuguese",
    "it": "Italian",
    "hi": "Hindi",
    "ar": "Arabic",
    "ru": "Russian",
    "ja": "Japanese",
    "ko": "Korean",
    "zh": "Chinese",
    "vi": "Vietnamese",
    "id": "Indonesian",
    "tr": "Turkish",
    "nl": "Dutch",
    "pl": "Polish",
    "th": "Thai",
    "uk": "Ukrainian",
    "el": "Greek",
    "cs": "Czech",
    "fi": "Finnish",
    "ro": "Romanian",
}

# Default language-specific Minimax voices (first-choice). We keep English on the existing "persona" voices.
# Keys are WaveSpeed `language_boost` labels, values are the Minimax voice id
# used when the caller did not choose a voice. "English" is deliberately not a
# key: English requests go through the persona-voice selection instead.
LANGUAGE_BOOST_TO_DEFAULT_VOICE_ID = {
    "Spanish": "Spanish_male_1_v1",
    "French": "French_male_1_v1",
    "German": "German_male_1_v1",
    "Portuguese": "Portuguese_male_1_v1",
    "Italian": "Italian_male_1_v1",
    "Hindi": "Hindi_male_1_v1",
    "Arabic": "Arabic_male_1_v1",
    "Russian": "Russian_male_1_v1",
    "Japanese": "Japanese_male_1_v1",
    "Korean": "Korean_male_1_v1",
    "Chinese": "Chinese_male_1_v1",
    "Vietnamese": "Vietnamese_male_1_v1",
    "Indonesian": "Indonesian_male_1_v1",
    "Turkish": "Turkish_male_1_v1",
    "Dutch": "Dutch_male_1_v1",
    "Polish": "Polish_male_1_v1",
    "Thai": "Thai_male_1_v1",
    "Ukrainian": "Ukrainian_male_1_v1",
    "Greek": "Greek_male_1_v1",
    "Czech": "Czech_male_1_v1",
    "Finnish": "Finnish_male_1_v1",
    "Romanian": "Romanian_male_1_v1",
}


def _resolve_language_boost(language: Optional[str], explicit_language_boost: Optional[str]) -> str:
    """
    Determine the effective WaveSpeed `language_boost`.

    Resolution order:
    - If the caller explicitly provided a non-blank language_boost, return it
      stripped (this includes "auto").
    - Else if a non-blank language code is provided, map it through
      LANGUAGE_CODE_TO_LANGUAGE_BOOST. The lookup is case-insensitive and, when
      the full code is not listed, is retried with the primary subtag so that
      locale-style codes such as "en-US", "pt_BR" or "zh-Hans" resolve to their
      base language. Codes that still do not match yield "auto".
    - Else default to "English" (backwards compatible).

    Args:
        language: Language code from the request, or None.
        explicit_language_boost: Caller-supplied boost label, or None.

    Returns:
        The `language_boost` label to send to the audio service.
    """
    explicit = "" if explicit_language_boost is None else str(explicit_language_boost).strip()
    if explicit:
        return explicit

    code = "" if language is None else str(language).strip().lower()
    if not code:
        return "English"

    boost = LANGUAGE_CODE_TO_LANGUAGE_BOOST.get(code)
    if boost is None:
        # Locale tags carry region/script subtags after "-" (or "_" in some
        # clients); only the leading primary-language subtag is in the table.
        primary_subtag = code.replace("_", "-").split("-", 1)[0]
        boost = LANGUAGE_CODE_TO_LANGUAGE_BOOST.get(primary_subtag, "auto")
    return boost
# Ordered (title_marker, keywords, emotion) rules; the first matching rule wins.
# `title_marker` is searched in the lowercased scene title only, `keywords` in
# the lowercased "title + narration" text. Matching is by substring, so e.g.
# "like" also matches "unlikely".
_EMOTION_KEYWORD_RULES = (
    # Hook scenes need excitement and energy
    ("hook", ("exciting", "amazing", "unbelievable", "shocking", "wow"), "surprised"),
    # Emotional stories or inspirational content: warm and uplifting
    (None, ("emotional", "touching", "heartwarming", "inspiring", "motivational"), "happy"),
    # Serious or professional content
    (None, ("important", "critical", "serious", "professional", "expert"), "neutral"),
    # Problem-solving or tutorial content: helpful and encouraging
    (None, ("problem", "solution", "fix", "help", "guide"), "happy"),
    # Call-to-action scenes: confident and encouraging
    ("cta", ("subscribe", "like", "comment", "share", "action"), "happy"),
    # Negative or concerning topics: serious but not alarming.
    # ("problem" is shadowed here by the tutorial rule above.)
    (None, ("warning", "danger", "risk", "problem", "issue"), "neutral"),
)


def select_optimal_emotion(scene_title: str, narration: str, video_plan_context: Optional[Dict[str, Any]] = None) -> str:
    """
    Select an emotion for YouTube narration from simple keyword heuristics.

    The scene title and narration are scanned against an ordered list of
    keyword rules; the first rule that matches decides the emotion and "happy"
    is used when none matches. Afterwards the video plan's ``tone`` (if any)
    can override the result: a tone containing "serious" or "professional"
    forces "neutral", otherwise one containing "fun" or "entertaining" forces
    "happy".

    Args:
        scene_title: Title of the scene (checked for "hook" / "cta" markers).
        narration: Narration text of the scene.
        video_plan_context: Optional plan dict; only its ``tone`` entry is
            read. A missing or ``None`` tone is treated as empty.

    Returns:
        One of the emotion strings this function can produce:
        "happy", "surprised" or "neutral".
    """
    title = (scene_title or "").lower()
    scene_text = f"{scene_title or ''} {narration or ''}".lower()

    # Default to happy for engaging YouTube content
    selected_emotion = "happy"
    for title_marker, keywords, emotion in _EMOTION_KEYWORD_RULES:
        title_hit = title_marker is not None and title_marker in title
        if title_hit or any(keyword in scene_text for keyword in keywords):
            selected_emotion = emotion
            break

    # Check video plan context for overall tone
    if video_plan_context:
        # `or ""` guards against an explicit null tone, which would otherwise
        # raise AttributeError on `.lower()`.
        tone = str(video_plan_context.get("tone") or "").lower()
        if "serious" in tone or "professional" in tone:
            selected_emotion = "neutral"
        elif "fun" in tone or "entertaining" in tone:
            selected_emotion = "happy"

    return selected_emotion
def _context_text(context: Dict[str, Any], key: str) -> str:
    """Return ``context[key]`` lowercased; a missing or ``None`` value becomes ""."""
    return str(context.get(key) or "").lower()


def select_optimal_voice(scene_title: str, narration: str, video_plan_context: Optional[Dict[str, Any]] = None) -> str:
    """
    Select a Minimax persona voice for a scene from keyword heuristics.

    Selection happens in two passes, the second overriding the first:

    1. Video plan context (``video_type``, ``target_audience``, ``tone``)
       picks a voice for the overall kind of video.
    2. Scene content (title + narration) picks a voice for hook, story,
       technical or call-to-action scenes.

    If neither pass matches, "Casual_Guy" is returned. All keyword checks are
    substring matches on lowercased text.

    Available voices:
    Wise_Woman, Friendly_Person, Inspirational_girl, Deep_Voice_Man, Calm_Woman,
    Casual_Guy, Lively_Girl, Patient_Man, Young_Knight, Determined_Man,
    Lovely_Girl, Decent_Boy, Imposing_Manner, Elegant_Man, Abbess,
    Sweet_Girl_2, Exuberant_Girl

    Args:
        scene_title: Title of the scene (checked for "hook" / "cta" markers).
        narration: Narration text of the scene.
        video_plan_context: Optional plan dict. Missing or ``None`` entries
            are treated as empty strings.

    Returns:
        The selected voice_id string.
    """
    title = scene_title or ""

    def mentions(haystack: str, needles) -> bool:
        return any(needle in haystack for needle in needles)

    # Default to Casual_Guy for engaging YouTube content
    selected_voice = "Casual_Guy"

    # Pass 1: analyze video plan context for content type
    if video_plan_context:
        # _context_text guards against explicit null values, which would
        # otherwise raise AttributeError on `.lower()`.
        video_type = _context_text(video_plan_context, "video_type")
        target_audience = _context_text(video_plan_context, "target_audience")
        tone = _context_text(video_plan_context, "tone")

        # Educational/Professional content
        if mentions(video_type, ("tutorial", "educational", "how-to", "guide", "course")):
            if "professional" in tone or "expert" in target_audience:
                selected_voice = "Wise_Woman"  # Authoritative and trustworthy
            else:
                selected_voice = "Patient_Man"  # Clear and instructional
        # Entertainment/Casual content
        elif mentions(video_type, ("entertainment", "vlog", "lifestyle", "story", "review")):
            if "young" in target_audience or "millennial" in target_audience:
                selected_voice = "Casual_Guy"  # Friendly and relatable
            elif "female" in target_audience or "women" in target_audience:
                selected_voice = "Lively_Girl"  # Energetic and engaging
            else:
                selected_voice = "Friendly_Person"  # Approachable
        # Motivational/Inspirational content
        elif mentions(video_type, ("motivational", "inspirational", "success", "mindset")):
            selected_voice = "Inspirational_girl"  # Uplifting and motivational
        # Business/Corporate content
        elif mentions(video_type, ("business", "corporate", "finance", "marketing")):
            selected_voice = "Elegant_Man"  # Professional and sophisticated
        # Tech/Gaming content
        elif mentions(video_type, ("tech", "gaming", "software", "app")):
            selected_voice = "Young_Knight"  # Energetic and modern

    # Pass 2: analyze scene content for specific voice requirements
    lowered_title = title.lower()
    scene_text = f"{title} {narration or ''}".lower()

    # Hook scenes need energetic, attention-grabbing voices
    if "hook" in lowered_title or mentions(scene_text, ("attention", "grab", "exciting", "amazing", "unbelievable")):
        selected_voice = "Exuberant_Girl"  # Very energetic and enthusiastic
    # Emotional/stories need more expressive voices
    elif mentions(scene_text, ("story", "emotional", "heartwarming", "touching", "inspiring")):
        selected_voice = "Inspirational_girl"  # Emotional and inspiring
    # Technical explanations need clear, precise voices
    elif mentions(scene_text, ("technical", "explain", "step-by-step", "process", "how-to")):
        selected_voice = "Calm_Woman"  # Clear and methodical
    # Call-to-action scenes need confident, persuasive voices
    elif "cta" in lowered_title or mentions(scene_text, ("subscribe", "like", "comment", "share", "now", "today")):
        selected_voice = "Determined_Man"  # Confident and persuasive

    logger.info(f"[VoiceSelection] Selected '{selected_voice}' for scene: {title[:50]}...")
    return selected_voice
# Request body for POST /audio: the narration text of one scene plus optional
# voice, language and audio-quality settings.
class YouTubeAudioRequest(BaseModel):
    scene_id: str
    scene_title: str
    text: str  # Narration; the handler rejects blank text with HTTP 400
    voice_id: Optional[str] = None  # Will auto-select based on content if not provided
    language: Optional[str] = None  # Language code for multilingual audio (e.g., "en", "es", "fr")
    speed: float = 1.0
    volume: float = 1.0
    pitch: float = 0.0
    # More engaging for YouTube content.
    # NOTE: the handler treats the value "happy" as "not set" and runs automatic
    # emotion selection for it, so an explicit "happy" cannot be distinguished
    # from the default.
    emotion: str = "happy"
    english_normalization: bool = False

    # Enhanced defaults for high-quality YouTube audio using Minimax Speech 02 HD
    # Higher quality settings for professional YouTube content
    sample_rate: Optional[int] = 44100  # CD quality: 44100 Hz (valid values: 8000, 16000, 22050, 24000, 32000, 44100)
    bitrate: int = 256000  # Highest quality: 256kbps (valid values: 32000, 64000, 128000, 256000)
    channel: Optional[str] = "2"  # Stereo for richer audio (valid values: "1" or "2"); other values are omitted by the handler
    format: Optional[str] = "mp3"  # Universal format for web
    language_boost: Optional[str] = None  # If not provided, inferred from `language` (or defaults to English)
    enable_sync_mode: bool = True

    # Context for intelligent voice/emotion selection.
    # The selection helpers read the keys "video_type", "target_audience" and "tone".
    video_plan_context: Optional[Dict[str, Any]] = None  # Optional video plan for context-aware voice selection


# Response body for POST /audio, describing the generated audio file.
class YouTubeAudioResponse(BaseModel):
    scene_id: str
    scene_title: str
    audio_filename: str
    audio_url: str  # Story-endpoint URLs are rewritten by the handler to /api/youtube/audio/{audio_filename}
    provider: str  # Handler falls back to "wavespeed" when the service result has no provider
    model: str  # Handler falls back to "minimax/speech-02-hd" when the service result has no model
    voice_id: str
    text_length: int  # Handler falls back to the length of the original request text
    file_size: int
    cost: float
def _strip_instructional_markers(text: str) -> str:
    """Remove bracketed non-spoken markers (e.g. ``[Pacing: slow]``) and collapse whitespace."""
    import re  # function-scope import, as in the original handler

    # Instructional markers that contain pacing, timing, or other non-spoken content
    marker_patterns = (
        r'\[Pacing:\s*[^\]]+\]',  # [Pacing: slow]
        r'\[Instructions?:\s*[^\]]+\]',  # [Instructions: ...]
        r'\[Timing:\s*[^\]]+\]',  # [Timing: ...]
        r'\[Note:\s*[^\]]+\]',  # [Note: ...]
        r'\[Internal:\s*[^\]]+\]',  # [Internal: ...]
    )
    cleaned = text.strip()
    # Patterns are applied one after another (not as a single alternation) so
    # a marker exposed by an earlier removal is still caught by a later pass.
    for pattern in marker_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)

    # Clean up extra whitespace and normalize
    return re.sub(r'\s+', ' ', cleaned).strip()


def _build_optional_audio_kwargs(request: "YouTubeAudioRequest", effective_language_boost: str) -> Dict[str, Any]:
    """Collect the optional audio-service keyword arguments, omitting unset or invalid values."""
    optional_kwargs: Dict[str, Any] = {}

    # DEBUG: Log what values we received
    logger.info(
        f"[YouTubeAudio] Request parameters: sample_rate={request.sample_rate}, bitrate={request.bitrate}, "
        f"channel={request.channel}, format={request.format}, language_boost={request.language_boost}, "
        f"effective_language_boost={effective_language_boost}, language={request.language}"
    )

    # sample_rate: Use provided value or omit (WaveSpeed will use default)
    if request.sample_rate is not None:
        optional_kwargs["sample_rate"] = request.sample_rate

    # bitrate: always sent. The request model declares a non-optional int with
    # a default of 256000, so request.bitrate is never None here.
    optional_kwargs["bitrate"] = request.bitrate

    # channel: only forwarded when it is exactly "1" or "2" after stripping;
    # None, empty or any other value is omitted so the provider default applies.
    channel = None if request.channel is None else str(request.channel).strip()
    if channel in ("1", "2"):
        optional_kwargs["channel"] = channel
        logger.info(f"[YouTubeAudio] Including valid channel: {optional_kwargs['channel']}")
    else:
        logger.info(f"[YouTubeAudio] Omitting invalid channel: {request.channel}")

    # format: Use provided value or omit (WaveSpeed will use default)
    if request.format is not None:
        optional_kwargs["format"] = request.format

    # language_boost: always send resolved value (improves pronunciation and helps multilingual voices)
    if effective_language_boost is not None and str(effective_language_boost).strip() != "":
        optional_kwargs["language_boost"] = effective_language_boost

    logger.info(f"[YouTubeAudio] Final optional_kwargs: {optional_kwargs}")
    return optional_kwargs


@router.post("/audio", response_model=YouTubeAudioResponse)
async def generate_youtube_scene_audio(
    request: YouTubeAudioRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
    db: Session = Depends(get_db),
):
    """
    Generate AI audio for a YouTube scene using shared audio service.
    Similar to Podcast's audio generation endpoint.

    Steps: authenticate, strip non-spoken markers from the text, resolve the
    language boost, pick voice and emotion (automatically unless provided),
    call the shared audio service, rewrite the returned URL to the YouTube
    endpoint, and record the file in the asset library (best effort).

    Raises:
        HTTPException: 400 when the text is blank or becomes blank after
            marker removal; any HTTPException raised during generation is
            propagated unchanged; every other failure becomes a 500.
    """
    user_id = require_authenticated_user(current_user)

    if not request.text or not request.text.strip():
        raise HTTPException(status_code=400, detail="Text is required")

    try:
        # Preprocess text to remove instructional markers that shouldn't be spoken
        processed_text = _strip_instructional_markers(request.text)

        if not processed_text:
            raise HTTPException(status_code=400, detail="Text became empty after removing instructions. Please provide clean narration text.")

        logger.info(f"[YouTubeAudio] Text preprocessing: {len(request.text)} -> {len(processed_text)} characters")

        effective_language_boost = _resolve_language_boost(request.language, request.language_boost)

        # Intelligent voice and emotion selection based on content analysis
        if request.voice_id:
            selected_voice = request.voice_id
        elif (
            effective_language_boost in LANGUAGE_BOOST_TO_DEFAULT_VOICE_ID
            and effective_language_boost not in ["English", "auto"]
        ):
            # If non-English language is selected, default to the language-specific Minimax voice_id.
            selected_voice = LANGUAGE_BOOST_TO_DEFAULT_VOICE_ID[effective_language_boost]
            logger.info(
                f"[VoiceSelection] Using language-specific default voice '{selected_voice}' "
                f"(language_boost={effective_language_boost}, language={request.language})"
            )
        else:
            # Otherwise keep the existing English persona voice selection logic.
            selected_voice = select_optimal_voice(
                request.scene_title,
                processed_text,
                request.video_plan_context
            )

        # Auto-select emotion when the request carries the default value.
        # NOTE: "happy" doubles as the "not set" marker, so an explicitly
        # requested "happy" is also auto-selected.
        if request.emotion == "happy":
            selected_emotion = select_optimal_emotion(
                request.scene_title,
                processed_text,
                request.video_plan_context
            )
        else:
            selected_emotion = request.emotion

        logger.info(
            f"[YouTubeAudio] Voice selection: {selected_voice}, Emotion: {selected_emotion}, "
            f"language={request.language}, language_boost={effective_language_boost}"
        )

        # Build kwargs for optional parameters; unset/invalid values are
        # omitted so the provider defaults apply instead of sending None.
        optional_kwargs = _build_optional_audio_kwargs(request, effective_language_boost)

        # NOTE(review): this is a synchronous call inside an async handler; if
        # the service performs blocking I/O it will hold up the event loop.
        result: StoryAudioResult = audio_service.generate_ai_audio(
            scene_number=0,
            scene_title=request.scene_title,
            text=processed_text,
            user_id=user_id,
            voice_id=selected_voice,
            speed=request.speed or 1.0,
            volume=request.volume or 1.0,
            pitch=request.pitch or 0.0,
            emotion=selected_emotion,
            english_normalization=request.english_normalization or False,
            enable_sync_mode=request.enable_sync_mode,
            **optional_kwargs,
        )

        # Override URL to use YouTube endpoint instead of story endpoint
        if result.get("audio_url") and "/api/story/audio/" in result.get("audio_url", ""):
            audio_filename = result.get("audio_filename", "")
            result["audio_url"] = f"/api/youtube/audio/{audio_filename}"
    except HTTPException:
        # Deliberate HTTP errors (e.g. the 400 above) must keep their status
        # code instead of being rewrapped as a 500 by the handler below.
        raise
    except Exception as exc:
        logger.error(f"[YouTube] Audio generation failed: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Audio generation failed: {exc}") from exc

    # Save to asset library (youtube_creator module).
    # Best effort: a failure here is logged and does not fail the request.
    try:
        if result.get("audio_url"):
            save_asset_to_library(
                db=db,
                user_id=user_id,
                asset_type="audio",
                source_module="youtube_creator",
                filename=result.get("audio_filename", ""),
                file_url=result.get("audio_url", ""),
                file_path=result.get("audio_path"),
                file_size=result.get("file_size"),
                mime_type="audio/mpeg",
                title=f"{request.scene_title} - YouTube",
                description="YouTube scene narration",
                tags=["youtube_creator", "audio", request.scene_id],
                provider=result.get("provider"),
                model=result.get("model"),
                cost=result.get("cost"),
                asset_metadata={
                    "scene_id": request.scene_id,
                    "scene_title": request.scene_title,
                    "status": "completed",
                },
            )
    except Exception as e:
        logger.warning(f"[YouTube] Failed to save audio asset: {e}")

    return YouTubeAudioResponse(
        scene_id=request.scene_id,
        scene_title=request.scene_title,
        audio_filename=result.get("audio_filename", ""),
        audio_url=result.get("audio_url", ""),
        provider=result.get("provider", "wavespeed"),
        model=result.get("model", "minimax/speech-02-hd"),
        voice_id=result.get("voice_id", selected_voice),
        text_length=result.get("text_length", len(request.text)),
        file_size=result.get("file_size", 0),
        cost=result.get("cost", 0.0),
    )