"""YouTube Creator scene audio generation handlers.

Exposes ``POST /audio`` to synthesize narration for a scene and
``GET /audio/{filename}`` to serve the generated audio files.
"""
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException
|
|
from fastapi.responses import FileResponse
|
|
from sqlalchemy.orm import Session
|
|
from typing import Dict, Any, Optional
|
|
from pydantic import BaseModel
|
|
|
|
from services.database import get_db
|
|
from middleware.auth_middleware import get_current_user, get_current_user_with_query_token
|
|
from api.story_writer.utils.auth import require_authenticated_user
|
|
from utils.asset_tracker import save_asset_to_library
|
|
from models.story_models import StoryAudioResult
|
|
from services.story_writer.audio_generation_service import StoryAudioGenerationService
|
|
from pathlib import Path
|
|
from utils.logger_utils import get_service_logger
|
|
|
|
# Router and logger shared by every endpoint in this module.
router = APIRouter(tags=["youtube-audio"])
logger = get_service_logger("api.youtube.audio")

# Generated audio lives in a "youtube_audio" folder four directory levels
# above this file; the folder is created at import time if it is missing.
base_dir = Path(__file__).parent.parent.parent.parent
YOUTUBE_AUDIO_DIR = base_dir.joinpath("youtube_audio")
YOUTUBE_AUDIO_DIR.mkdir(parents=True, exist_ok=True)

# Shared audio generation service, pointed at the YouTube output directory.
audio_service = StoryAudioGenerationService(output_dir=str(YOUTUBE_AUDIO_DIR))
|
|
|
|
# Maps a lowercase two-letter language code to the WaveSpeed Minimax Speech
# `language_boost` label. WaveSpeed voice ids include language-specific voices.
# Ref: https://wavespeed.ai/docs/docs-api/minimax/minimax_speech_voice_id
LANGUAGE_CODE_TO_LANGUAGE_BOOST = {
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "de": "German",
    "pt": "Portuguese",
    "it": "Italian",
    "hi": "Hindi",
    "ar": "Arabic",
    "ru": "Russian",
    "ja": "Japanese",
    "ko": "Korean",
    "zh": "Chinese",
    "vi": "Vietnamese",
    "id": "Indonesian",
    "tr": "Turkish",
    "nl": "Dutch",
    "pl": "Polish",
    "th": "Thai",
    "uk": "Ukrainian",
    "el": "Greek",
    "cs": "Czech",
    "fi": "Finnish",
    "ro": "Romanian",
}
|
|
|
|
# First-choice Minimax voice per `language_boost` label. English is left out
# on purpose: English requests keep using the existing "persona" voices.
# NOTE(review): every id below follows the same `<Language>_male_1_v1` pattern;
# confirm each one actually exists in the WaveSpeed voice list.
LANGUAGE_BOOST_TO_DEFAULT_VOICE_ID = {
    "Spanish": "Spanish_male_1_v1",
    "French": "French_male_1_v1",
    "German": "German_male_1_v1",
    "Portuguese": "Portuguese_male_1_v1",
    "Italian": "Italian_male_1_v1",
    "Hindi": "Hindi_male_1_v1",
    "Arabic": "Arabic_male_1_v1",
    "Russian": "Russian_male_1_v1",
    "Japanese": "Japanese_male_1_v1",
    "Korean": "Korean_male_1_v1",
    "Chinese": "Chinese_male_1_v1",
    "Vietnamese": "Vietnamese_male_1_v1",
    "Indonesian": "Indonesian_male_1_v1",
    "Turkish": "Turkish_male_1_v1",
    "Dutch": "Dutch_male_1_v1",
    "Polish": "Polish_male_1_v1",
    "Thai": "Thai_male_1_v1",
    "Ukrainian": "Ukrainian_male_1_v1",
    "Greek": "Greek_male_1_v1",
    "Czech": "Czech_male_1_v1",
    "Finnish": "Finnish_male_1_v1",
    "Romanian": "Romanian_male_1_v1",
}
|
|
|
|
|
|
def _resolve_language_boost(language: Optional[str], explicit_language_boost: Optional[str]) -> str:
    """
    Determine the effective WaveSpeed `language_boost`.

    Resolution order:
    - If the caller explicitly provided a non-blank language_boost, use it
      verbatim after trimming (including "auto").
    - Else if a language code was provided, map it to the WaveSpeed boost
      label. Region-qualified codes such as "en-US", "pt_BR" or "zh-CN" are
      matched on their primary subtag; unknown codes resolve to "auto".
    - Else default to English (backwards compatible).

    Args:
        language: Language code from the request (e.g. "en", "es", "pt-BR").
        explicit_language_boost: Boost label supplied directly by the caller.

    Returns:
        The `language_boost` value to send to the audio service.
    """
    explicit = "" if explicit_language_boost is None else str(explicit_language_boost).strip()
    if explicit:
        return explicit

    lang_code = "" if language is None else str(language).strip().lower()
    if not lang_code:
        return "English"

    boost = LANGUAGE_CODE_TO_LANGUAGE_BOOST.get(lang_code)
    if boost is None:
        # Locale-style tags carry a region/script suffix after "-" or "_";
        # only the primary language subtag is relevant for the boost label.
        primary_subtag = lang_code.replace("_", "-").split("-", 1)[0]
        boost = LANGUAGE_CODE_TO_LANGUAGE_BOOST.get(primary_subtag)

    return boost if boost is not None else "auto"
|
|
|
|
def select_optimal_emotion(scene_title: str, narration: str, video_plan_context: Optional[Dict[str, Any]] = None) -> str:
    """
    Intelligently select the best emotion for YouTube content based on scene analysis.

    Keyword rules are evaluated in priority order against the lowercased
    title + narration (plain substring matching); the first matching rule wins.
    A "tone" entry in the video plan context, when it matches, overrides the
    keyword-based choice.

    Available emotions: "happy", "sad", "angry", "fearful", "disgusted", "surprised", "neutral"

    Args:
        scene_title: Scene title; "hook" / "cta" in the title trigger dedicated rules.
        narration: Narration text for the scene.
        video_plan_context: Optional video plan; only its "tone" value is read.
            A missing, None or non-string tone is treated as "no tone".

    Returns the selected emotion string.
    """
    # Tolerate None so a partially filled request cannot crash the selection.
    title = str(scene_title or "").lower()
    scene_text = f"{scene_title or ''} {narration or ''}".lower()

    def mentions(*words: str) -> bool:
        """Return True when any of the words occurs in the scene text."""
        return any(word in scene_text for word in words)

    # Default to happy for engaging YouTube content
    selected_emotion = "happy"

    if "hook" in title or mentions("exciting", "amazing", "unbelievable", "shocking", "wow"):
        # Hook scenes need excitement and energy
        selected_emotion = "surprised"
    elif mentions("emotional", "touching", "heartwarming", "inspiring", "motivational"):
        # Emotional stories or inspirational content: warm and uplifting
        selected_emotion = "happy"
    elif mentions("important", "critical", "serious", "professional", "expert"):
        # Serious or professional content
        selected_emotion = "neutral"
    elif mentions("problem", "solution", "fix", "help", "guide"):
        # Problem-solving or tutorial content: helpful and encouraging
        selected_emotion = "happy"
    elif "cta" in title or mentions("subscribe", "like", "comment", "share", "action"):
        # Call-to-action scenes: confident and encouraging
        selected_emotion = "happy"
    elif mentions("warning", "danger", "risk", "problem", "issue"):
        # Negative or concerning topics: serious but not alarming.
        # ("problem" is already consumed by the tutorial rule above.)
        selected_emotion = "neutral"

    # The overall tone of the video plan takes precedence over keyword hits.
    if video_plan_context:
        tone = str(video_plan_context.get("tone") or "").lower()
        if "serious" in tone or "professional" in tone:
            selected_emotion = "neutral"
        elif "fun" in tone or "entertaining" in tone:
            selected_emotion = "happy"

    return selected_emotion
|
|
|
|
|
|
def select_optimal_voice(scene_title: str, narration: str, video_plan_context: Optional[Dict[str, Any]] = None) -> str:
    """
    Intelligently select the best voice for YouTube content based on scene analysis.

    Selection happens in two passes:
    1. The video plan context ("video_type", "target_audience", "tone") picks
       a baseline voice for the kind of video.
    2. Scene-level keywords (title + narration, plain substring matching)
       override that baseline for hook, story, technical and call-to-action
       scenes.

    Available voices: Wise_Woman, Friendly_Person, Inspirational_girl, Deep_Voice_Man,
    Calm_Woman, Casual_Guy, Lively_Girl, Patient_Man, Young_Knight, Determined_Man,
    Lovely_Girl, Decent_Boy, Imposing_Manner, Elegant_Man, Abbess, Sweet_Girl_2, Exuberant_Girl

    Args:
        scene_title: Scene title; "hook" / "cta" in the title trigger dedicated rules.
        narration: Narration text for the scene.
        video_plan_context: Optional video plan. Missing, None or non-string
            values are treated as empty strings.

    Returns the selected voice_id string.
    """
    def as_lower(value: Any) -> str:
        """Lowercase a context value, mapping None/falsy values to ''."""
        return str(value or "").lower()

    def contains_any(haystack: str, needles) -> bool:
        """Return True when any needle is a substring of haystack."""
        return any(needle in haystack for needle in needles)

    # Default to Casual_Guy for engaging YouTube content
    selected_voice = "Casual_Guy"

    # Pass 1: baseline voice from the overall video plan.
    if video_plan_context:
        video_type = as_lower(video_plan_context.get("video_type"))
        target_audience = as_lower(video_plan_context.get("target_audience"))
        tone = as_lower(video_plan_context.get("tone"))

        if contains_any(video_type, ("tutorial", "educational", "how-to", "guide", "course")):
            # Educational/Professional content
            if "professional" in tone or "expert" in target_audience:
                selected_voice = "Wise_Woman"  # Authoritative and trustworthy
            else:
                selected_voice = "Patient_Man"  # Clear and instructional
        elif contains_any(video_type, ("entertainment", "vlog", "lifestyle", "story", "review")):
            # Entertainment/Casual content
            if "young" in target_audience or "millennial" in target_audience:
                selected_voice = "Casual_Guy"  # Friendly and relatable
            elif "female" in target_audience or "women" in target_audience:
                selected_voice = "Lively_Girl"  # Energetic and engaging
            else:
                selected_voice = "Friendly_Person"  # Approachable
        elif contains_any(video_type, ("motivational", "inspirational", "success", "mindset")):
            # Motivational/Inspirational content
            selected_voice = "Inspirational_girl"
        elif contains_any(video_type, ("business", "corporate", "finance", "marketing")):
            # Business/Corporate content
            selected_voice = "Elegant_Man"
        elif contains_any(video_type, ("tech", "gaming", "software", "app")):
            # Tech/Gaming content
            selected_voice = "Young_Knight"

    # Pass 2: scene-specific overrides. None is tolerated so a partially
    # filled request cannot crash the selection.
    title = str(scene_title or "")
    title_lower = title.lower()
    scene_text = f"{title} {narration or ''}".lower()

    if "hook" in title_lower or contains_any(scene_text, ("attention", "grab", "exciting", "amazing", "unbelievable")):
        # Hook scenes need energetic, attention-grabbing voices
        selected_voice = "Exuberant_Girl"
    elif contains_any(scene_text, ("story", "emotional", "heartwarming", "touching", "inspiring")):
        # Emotional/stories need more expressive voices
        selected_voice = "Inspirational_girl"
    elif contains_any(scene_text, ("technical", "explain", "step-by-step", "process", "how-to")):
        # Technical explanations need clear, precise voices
        selected_voice = "Calm_Woman"
    elif "cta" in title_lower or contains_any(scene_text, ("subscribe", "like", "comment", "share", "now", "today")):
        # Call-to-action scenes need confident, persuasive voices.
        # NOTE(review): substring matching means e.g. "know" also hits "now".
        selected_voice = "Determined_Man"

    logger.info(f"[VoiceSelection] Selected '{selected_voice}' for scene: {title[:50]}...")
    return selected_voice
|
|
|
|
|
|
class YouTubeAudioRequest(BaseModel):
    # Request body for generating the narration audio of one YouTube scene.

    # --- Scene identity and narration -------------------------------------
    scene_id: str
    scene_title: str
    text: str

    # --- Voice and language -----------------------------------------------
    # Will auto-select based on content if not provided
    voice_id: Optional[str] = None
    # Language code for multilingual audio (e.g., "en", "es", "fr")
    language: Optional[str] = None

    # --- Delivery ---------------------------------------------------------
    speed: float = 1.0
    volume: float = 1.0
    pitch: float = 0.0
    # More engaging for YouTube content
    emotion: str = "happy"
    english_normalization: bool = False

    # --- Output quality ---------------------------------------------------
    # Enhanced defaults for high-quality YouTube audio using Minimax Speech 02 HD
    # Higher quality settings for professional YouTube content
    # CD quality: 44100 Hz (valid values: 8000, 16000, 22050, 24000, 32000, 44100)
    sample_rate: Optional[int] = 44100
    # Highest quality: 256kbps (valid values: 32000, 64000, 128000, 256000)
    bitrate: int = 256000
    # Stereo for richer audio (valid values: "1" or "2")
    channel: Optional[str] = "2"
    # Universal format for web
    format: Optional[str] = "mp3"
    # If not provided, inferred from `language` (or defaults to English)
    language_boost: Optional[str] = None
    enable_sync_mode: bool = True

    # --- Context for intelligent voice/emotion selection ------------------
    # Optional video plan for context-aware voice selection
    video_plan_context: Optional[Dict[str, Any]] = None
|
|
|
|
|
|
class YouTubeAudioResponse(BaseModel):
    # Response body returned by the POST /audio endpoint.

    # Scene the audio was generated for (echoed back from the request).
    scene_id: str
    scene_title: str

    # Where the generated audio can be found.
    audio_filename: str
    audio_url: str

    # Generation details reported by the audio service.
    provider: str
    model: str
    voice_id: str
    text_length: int
    file_size: int
    cost: float
|
|
|
|
|
|
def _strip_instructional_markers(text: str) -> str:
    """Remove bracketed stage directions that must not be spoken, then collapse whitespace."""
    import re

    # Markers such as [Pacing: slow], [Instructions: ...], [Timing: ...],
    # [Note: ...] and [Internal: ...] carry non-spoken guidance.
    marker_patterns = (
        r'\[Pacing:\s*[^\]]+\]',
        r'\[Instructions?:\s*[^\]]+\]',
        r'\[Timing:\s*[^\]]+\]',
        r'\[Note:\s*[^\]]+\]',
        r'\[Internal:\s*[^\]]+\]',
    )

    cleaned = text.strip()
    for pattern in marker_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)

    # Clean up extra whitespace and normalize
    return re.sub(r'\s+', ' ', cleaned).strip()


def _build_optional_audio_kwargs(request: YouTubeAudioRequest, effective_language_boost: str) -> Dict[str, Any]:
    """Collect the optional audio-service parameters, leaving out unset or invalid values."""
    optional_kwargs: Dict[str, Any] = {}

    # DEBUG: Log what values we received
    logger.info(
        f"[YouTubeAudio] Request parameters: sample_rate={request.sample_rate}, bitrate={request.bitrate}, "
        f"channel={request.channel}, format={request.format}, language_boost={request.language_boost}, "
        f"effective_language_boost={effective_language_boost}, language={request.language}"
    )

    # sample_rate: Use provided value or omit (WaveSpeed will use default)
    if request.sample_rate is not None:
        optional_kwargs["sample_rate"] = request.sample_rate

    # bitrate: always sent. The request model declares it as a non-optional
    # int defaulting to 256000 (valid values: 32000, 64000, 128000, 256000).
    optional_kwargs["bitrate"] = request.bitrate

    # channel: only forwarded when it is exactly "1" or "2"; anything else
    # (None, empty, invalid) is omitted so WaveSpeed falls back to its default.
    channel = None if request.channel is None else str(request.channel).strip()
    if channel in ("1", "2"):
        optional_kwargs["channel"] = channel
        logger.info(f"[YouTubeAudio] Including valid channel: {optional_kwargs['channel']}")
    else:
        logger.info(f"[YouTubeAudio] Omitting invalid channel: {request.channel}")

    # format: Use provided value or omit (WaveSpeed will use default)
    if request.format is not None:
        optional_kwargs["format"] = request.format

    # language_boost: always send resolved value (improves pronunciation and helps multilingual voices)
    if effective_language_boost is not None and str(effective_language_boost).strip() != "":
        optional_kwargs["language_boost"] = effective_language_boost

    logger.info(f"[YouTubeAudio] Final optional_kwargs: {optional_kwargs}")
    return optional_kwargs


@router.post("/audio", response_model=YouTubeAudioResponse)
async def generate_youtube_scene_audio(
    request: YouTubeAudioRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
    db: Session = Depends(get_db),
):
    """
    Generate AI audio for a YouTube scene using shared audio service.
    Similar to Podcast's audio generation endpoint.

    Responds with 400 when the narration text is empty (before or after
    instructional markers are removed) and 500 when audio generation fails.
    """
    user_id = require_authenticated_user(current_user)

    if not request.text or not request.text.strip():
        raise HTTPException(status_code=400, detail="Text is required")

    # Input validation runs outside the generation try-block so that the 400
    # below reaches the client instead of being rewrapped as a 500.
    processed_text = _strip_instructional_markers(request.text)
    if not processed_text:
        raise HTTPException(status_code=400, detail="Text became empty after removing instructions. Please provide clean narration text.")

    logger.info(f"[YouTubeAudio] Text preprocessing: {len(request.text)} -> {len(processed_text)} characters")

    try:
        effective_language_boost = _resolve_language_boost(request.language, request.language_boost)

        # Intelligent voice and emotion selection based on content analysis
        if request.voice_id:
            selected_voice = request.voice_id
        elif effective_language_boost in LANGUAGE_BOOST_TO_DEFAULT_VOICE_ID and effective_language_boost not in ["English", "auto"]:
            # If non-English language is selected, default to the language-specific Minimax voice_id.
            selected_voice = LANGUAGE_BOOST_TO_DEFAULT_VOICE_ID[effective_language_boost]
            logger.info(
                f"[VoiceSelection] Using language-specific default voice '{selected_voice}' "
                f"(language_boost={effective_language_boost}, language={request.language})"
            )
        else:
            # Otherwise keep the existing English persona voice selection logic.
            selected_voice = select_optimal_voice(
                request.scene_title,
                processed_text,
                request.video_plan_context,
            )

        # "happy" is the model default and is treated as "not specified".
        # NOTE(review): an explicit "happy" sent by the client is
        # indistinguishable from the default and is re-selected as well.
        if request.emotion == "happy":
            selected_emotion = select_optimal_emotion(
                request.scene_title,
                processed_text,
                request.video_plan_context,
            )
        else:
            selected_emotion = request.emotion

        logger.info(
            f"[YouTubeAudio] Voice selection: {selected_voice}, Emotion: {selected_emotion}, "
            f"language={request.language}, language_boost={effective_language_boost}"
        )

        optional_kwargs = _build_optional_audio_kwargs(request, effective_language_boost)

        # NOTE(review): this is a synchronous call inside an async handler;
        # if the service blocks on network I/O it will stall the event loop.
        result: StoryAudioResult = audio_service.generate_ai_audio(
            scene_number=0,
            scene_title=request.scene_title,
            text=processed_text,
            user_id=user_id,
            voice_id=selected_voice,
            speed=request.speed or 1.0,
            volume=request.volume or 1.0,
            pitch=request.pitch or 0.0,
            emotion=selected_emotion,
            english_normalization=request.english_normalization or False,
            enable_sync_mode=request.enable_sync_mode,
            **optional_kwargs,
        )

        # Override URL to use YouTube endpoint instead of story endpoint
        if result.get("audio_url") and "/api/story/audio/" in result.get("audio_url", ""):
            audio_filename = result.get("audio_filename", "")
            result["audio_url"] = f"/api/youtube/audio/{audio_filename}"
    except HTTPException:
        # Preserve deliberate HTTP errors (status code and detail) raised
        # further down the stack instead of masking them as a 500.
        raise
    except Exception as exc:
        logger.error(f"[YouTube] Audio generation failed: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Audio generation failed: {exc}") from exc

    # Save to asset library (youtube_creator module). Best effort: a failure
    # here is logged but must not fail the request, the audio already exists.
    try:
        if result.get("audio_url"):
            save_asset_to_library(
                db=db,
                user_id=user_id,
                asset_type="audio",
                source_module="youtube_creator",
                filename=result.get("audio_filename", ""),
                file_url=result.get("audio_url", ""),
                file_path=result.get("audio_path"),
                file_size=result.get("file_size"),
                mime_type="audio/mpeg",
                title=f"{request.scene_title} - YouTube",
                description="YouTube scene narration",
                tags=["youtube_creator", "audio", request.scene_id],
                provider=result.get("provider"),
                model=result.get("model"),
                cost=result.get("cost"),
                asset_metadata={
                    "scene_id": request.scene_id,
                    "scene_title": request.scene_title,
                    "status": "completed",
                },
            )
    except Exception as e:
        logger.warning(f"[YouTube] Failed to save audio asset: {e}")

    return YouTubeAudioResponse(
        scene_id=request.scene_id,
        scene_title=request.scene_title,
        audio_filename=result.get("audio_filename", ""),
        audio_url=result.get("audio_url", ""),
        provider=result.get("provider", "wavespeed"),
        model=result.get("model", "minimax/speech-02-hd"),
        voice_id=result.get("voice_id", selected_voice),
        # Fall back to the length of the text that was actually synthesized.
        text_length=result.get("text_length", len(processed_text)),
        file_size=result.get("file_size", 0),
        cost=result.get("cost", 0.0),
    )
|
|
|
|
|
|
@router.get("/audio/{filename}")
async def serve_youtube_audio(
    filename: str,
    current_user: Dict[str, Any] = Depends(get_current_user_with_query_token),
):
    """Serve generated YouTube scene audio files.

    Supports authentication via Authorization header or token query parameter.
    Query parameter is useful for HTML elements like <audio> that cannot send custom headers.

    Responds with 400 for filenames containing path separators or "..",
    403 when the resolved path is not inside the audio directory, and 404
    when no such file exists.
    """
    require_authenticated_user(current_user)

    # Security check: ensure filename doesn't contain path traversal
    if ".." in filename or "/" in filename or "\\" in filename:
        raise HTTPException(status_code=400, detail="Invalid filename")

    # Resolve BOTH sides before comparing: the directory constant is built
    # from __file__ and is not guaranteed to be a resolved path, so comparing
    # it against a resolved file path could reject every legitimate request.
    audio_root = YOUTUBE_AUDIO_DIR.resolve()
    audio_path = (audio_root / filename).resolve()

    # Security check: ensure path is within YOUTUBE_AUDIO_DIR. Comparing path
    # components (rather than a string prefix) rejects sibling directories
    # that merely share the prefix, as well as the directory itself.
    if audio_root not in audio_path.parents:
        raise HTTPException(status_code=403, detail="Access denied")

    # Only regular files can be served; a directory must yield a 404.
    if not audio_path.is_file():
        raise HTTPException(status_code=404, detail="Audio file not found")

    return FileResponse(audio_path, media_type="audio/mpeg")
|
|
|