""" Audio Generation Service for Story Writer Generates audio narration for story scenes using TTS (Text-to-Speech) providers. """ import os import uuid from typing import List, Dict, Any, Optional from pathlib import Path from loguru import logger from fastapi import HTTPException from sqlalchemy.orm import Session def _get_story_media_write_dir(media_type: str, user_id: Optional[str] = None, db: Optional[Session] = None) -> Path: """Lazy import wrapper to avoid circular imports.""" from api.story_writer.utils.media_utils import get_story_media_write_dir return get_story_media_write_dir(media_type, user_id=user_id, db=db) class StoryAudioGenerationService: """Service for generating audio narration for story scenes.""" def __init__(self, output_dir: Optional[str] = None): """ Initialize the audio generation service. Parameters: output_dir (str, optional): Directory to save generated audio files. Defaults to canonical workspace media path if not provided. """ if output_dir: self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) else: self.output_dir = _get_story_media_write_dir("audio") logger.info(f"[StoryAudioGeneration] Initialized with output directory: {self.output_dir}") def _get_user_audio_dir(self, user_id: str, db: Optional[Session] = None) -> Path: """ Get the audio directory for a specific user. Falls back to default output_dir if workspace not found. """ try: return _get_story_media_write_dir("audio", user_id=user_id, db=db) except Exception as e: logger.warning(f"[StoryAudioGeneration] Failed to resolve user workspace path for {user_id}: {e}") # Don't fall back to default - keep using the already-set output_dir for podcast return self.output_dir def _generate_audio_filename(self, scene_number: int, scene_title: str) -> str: """Generate a unique filename for a scene audio file.""" # Clean scene title for filename clean_title = "".join(c if c.isalnum() or c in ('-', '_') else '_' for c in scene_title[:30]) unique_id = str(uuid.uuid4())[:8] return f"scene_{scene_number}_{clean_title}_{unique_id}.mp3" def _generate_audio_gtts( self, text: str, output_path: Path, lang: str = "en", slow: bool = False ) -> bool: """ Generate audio using Google Text-to-Speech (gTTS). Parameters: text (str): Text to convert to speech. output_path (Path): Path to save the audio file. lang (str): Language code (default: "en"). slow (bool): Whether to speak slowly (default: False). Returns: bool: True if generation was successful, False otherwise. """ try: from gtts import gTTS # Generate speech tts = gTTS(text=text, lang=lang, slow=slow) # Save to file tts.save(str(output_path)) logger.info(f"[StoryAudioGeneration] Generated audio using gTTS: {output_path}") return True except ImportError as e: logger.error(f"[StoryAudioGeneration] gTTS not installed. ImportError: {e}. Install with: pip install gtts") return False except Exception as e: logger.error(f"[StoryAudioGeneration] Error generating audio with gTTS: {type(e).__name__}: {e}") return False def _generate_audio_pyttsx3( self, text: str, output_path: Path, rate: int = 150, voice: Optional[str] = None ) -> bool: """ Generate audio using pyttsx3 (offline TTS). Parameters: text (str): Text to convert to speech. output_path (Path): Path to save the audio file. rate (int): Speech rate (default: 150). voice (str, optional): Voice ID to use. Returns: bool: True if generation was successful, False otherwise. """ try: import pyttsx3 # Initialize TTS engine engine = pyttsx3.init() # Set speech rate engine.setProperty('rate', rate) # Set voice if provided if voice: voices = engine.getProperty('voices') for v in voices: if voice in v.id: engine.setProperty('voice', v.id) break # Generate speech and save to file engine.save_to_file(text, str(output_path)) engine.runAndWait() logger.info(f"[StoryAudioGeneration] Generated audio using pyttsx3: {output_path}") return True except ImportError: logger.error("[StoryAudioGeneration] pyttsx3 not installed. Install with: pip install pyttsx3") return False except Exception as e: logger.error(f"[StoryAudioGeneration] Error generating audio with pyttsx3: {e}") return False def generate_scene_audio( self, scene: Dict[str, Any], user_id: str, provider: str = "gtts", lang: str = "en", slow: bool = False, rate: int = 150, db: Optional[Session] = None ) -> Dict[str, Any]: """ Generate audio narration for a single story scene. Parameters: scene (Dict[str, Any]): Scene data with audio_narration text. user_id (str): Clerk user ID for subscription checking (for future usage tracking). provider (str): TTS provider to use ("gtts", "pyttsx3", etc.). lang (str): Language code for TTS (default: "en"). slow (bool): Whether to speak slowly (default: False, gTTS only). rate (int): Speech rate (default: 150, pyttsx3 only). db (Session, optional): Database session. Returns: Dict[str, Any]: Audio metadata including file path, URL, and scene info. """ scene_number = scene.get("scene_number", 0) scene_title = scene.get("title", "Untitled") audio_narration = scene.get("audio_narration", "") if not audio_narration: raise ValueError(f"Scene {scene_number} ({scene_title}) has no audio_narration") try: logger.info(f"[StoryAudioGeneration] Generating audio for scene {scene_number}: {scene_title}") logger.debug(f"[StoryAudioGeneration] Audio narration: {audio_narration[:100]}...") # Determine output directory (user workspace or default) output_dir = self._get_user_audio_dir(user_id, db) # Generate audio filename audio_filename = self._generate_audio_filename(scene_number, scene_title) audio_path = output_dir / audio_filename # Generate audio based on provider success = False if provider == "gtts": success = self._generate_audio_gtts( text=audio_narration, output_path=audio_path, lang=lang, slow=slow ) elif provider == "pyttsx3": success = self._generate_audio_pyttsx3( text=audio_narration, output_path=audio_path, rate=rate ) else: # Default to gTTS logger.warning(f"[StoryAudioGeneration] Unknown provider '{provider}', using gTTS") success = self._generate_audio_gtts( text=audio_narration, output_path=audio_path, lang=lang, slow=slow ) if not success or not audio_path.exists(): raise RuntimeError(f"Failed to generate audio file: {audio_path}") # Get file size file_size = audio_path.stat().st_size logger.info(f"[StoryAudioGeneration] Saved audio to: {audio_path} ({file_size} bytes)") # Return audio metadata return { "scene_number": scene_number, "scene_title": scene_title, "audio_path": str(audio_path), "audio_filename": audio_filename, "audio_url": f"/api/story/audio/{audio_filename}", # API endpoint to serve audio "provider": provider, "file_size": file_size, } except HTTPException: # Re-raise HTTPExceptions (e.g., 429 subscription limit) raise except Exception as e: logger.error(f"[StoryAudioGeneration] Error generating audio for scene {scene_number}: {e}") raise RuntimeError(f"Failed to generate audio for scene {scene_number}: {str(e)}") from e def generate_scene_audio_list( self, scenes: List[Dict[str, Any]], user_id: str, provider: str = "gtts", lang: str = "en", slow: bool = False, rate: int = 150, progress_callback: Optional[callable] = None, db: Optional[Session] = None ) -> List[Dict[str, Any]]: """ Generate audio narration for multiple story scenes. Parameters: scenes (List[Dict[str, Any]]): List of scene data with audio_narration text. user_id (str): Clerk user ID for subscription checking. provider (str): TTS provider to use ("gtts", "pyttsx3", etc.). lang (str): Language code for TTS (default: "en"). slow (bool): Whether to speak slowly (default: False, gTTS only). rate (int): Speech rate (default: 150, pyttsx3 only). progress_callback (callable, optional): Callback function for progress updates. db (Session, optional): Database session. Returns: List[Dict[str, Any]]: List of audio metadata for each scene. """ if not scenes: raise ValueError("No scenes provided for audio generation") logger.info(f"[StoryAudioGeneration] Generating audio for {len(scenes)} scenes") audio_results = [] total_scenes = len(scenes) for idx, scene in enumerate(scenes): try: # Generate audio for scene audio_result = self.generate_scene_audio( scene=scene, user_id=user_id, provider=provider, lang=lang, slow=slow, rate=rate, db=db ) audio_results.append(audio_result) # Call progress callback if provided if progress_callback: progress = ((idx + 1) / total_scenes) * 100 progress_callback(progress, f"Generated audio for scene {scene.get('scene_number', idx + 1)}") logger.info(f"[StoryAudioGeneration] Generated audio {idx + 1}/{total_scenes}") except Exception as e: logger.error(f"[StoryAudioGeneration] Failed to generate audio for scene {idx + 1}: {e}") # Continue with next scene instead of failing completely # Use empty strings for required fields instead of None audio_results.append({ "scene_number": scene.get("scene_number", idx + 1), "scene_title": scene.get("title", "Untitled"), "audio_filename": "", "audio_url": "", "provider": provider, "file_size": 0, "error": str(e), }) logger.info(f"[StoryAudioGeneration] Generated {len(audio_results)} audio files out of {total_scenes} scenes") return audio_results def generate_ai_audio( self, scene_number: int, scene_title: str, text: str, user_id: str, voice_id: str = "Wise_Woman", custom_voice_id: Optional[str] = None, speed: float = 1.0, volume: float = 1.0, pitch: float = 0.0, emotion: str = "happy", english_normalization: bool = False, sample_rate: Optional[int] = None, bitrate: Optional[int] = None, channel: Optional[str] = None, format: Optional[str] = None, language_boost: Optional[str] = None, enable_sync_mode: Optional[bool] = True, db: Optional[Session] = None, ) -> Dict[str, Any]: """ Generate AI audio for a single scene using main_audio_generation. Parameters: scene_number (int): Scene number. scene_title (str): Scene title. text (str): Text to convert to speech. user_id (str): Clerk user ID for subscription checking. voice_id (str): Voice ID for AI audio generation (default: "Wise_Woman"). speed (float): Speech speed (0.5-2.0, default: 1.0). volume (float): Speech volume (0.1-10.0, default: 1.0). pitch (float): Speech pitch (-12 to 12, default: 0.0). emotion (str): Emotion for speech (default: "happy"). english_normalization (bool): Enable English text normalization for better number reading (default: False). db (Session, optional): Database session. Returns: Dict[str, Any]: Audio metadata including file path, URL, and scene info. """ if not text or not text.strip(): raise ValueError(f"Scene {scene_number} ({scene_title}) requires non-empty text") try: logger.info(f"[StoryAudioGeneration] Generating AI audio for scene {scene_number}: {scene_title}") logger.debug(f"[StoryAudioGeneration] Text length: {len(text)} characters, voice: {voice_id}") # Import main_audio_generation from services.llm_providers.main_audio_generation import generate_audio # Generate audio using main_audio_generation service result = generate_audio( text=text.strip(), voice_id=voice_id, custom_voice_id=custom_voice_id, speed=speed, volume=volume, pitch=pitch, emotion=emotion, user_id=user_id, english_normalization=english_normalization, sample_rate=sample_rate, bitrate=bitrate, channel=channel, format=format, language_boost=language_boost, enable_sync_mode=enable_sync_mode, ) # Use the output_dir that was set when service was created (already handles podcast vs story) output_dir = self.output_dir # Save audio to file audio_filename = self._generate_audio_filename(scene_number, scene_title) audio_path = output_dir / audio_filename with open(audio_path, "wb") as f: f.write(result.audio_bytes) logger.info(f"[StoryAudioGeneration] Saved AI audio to: {audio_path} ({result.file_size} bytes)") # Calculate cost (for response) character_count = result.text_length cost_per_1000_chars = 0.05 cost = (character_count / 1000.0) * cost_per_1000_chars # Return audio metadata return { "scene_number": scene_number, "scene_title": scene_title, "audio_path": str(audio_path), "audio_filename": audio_filename, "audio_url": f"/api/story/audio/{audio_filename}", "provider": result.provider, "model": result.model, "voice_id": result.voice_id, "text_length": result.text_length, "file_size": result.file_size, "cost": cost, } except HTTPException: # Re-raise HTTPExceptions (e.g., 429 subscription limit) raise except Exception as e: logger.error(f"[StoryAudioGeneration] Error generating AI audio for scene {scene_number}: {e}") raise RuntimeError(f"Failed to generate AI audio for scene {scene_number}: {str(e)}") from e