Base code

Kunthawat Greethong
2026-01-08 22:39:53 +07:00
parent 697115c61a
commit c35fa52117
2169 changed files with 626670 additions and 0 deletions


@@ -0,0 +1,465 @@
"""YouTube Creator scene audio generation handlers."""
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from typing import Dict, Any, Optional
from pydantic import BaseModel
from services.database import get_db
from middleware.auth_middleware import get_current_user, get_current_user_with_query_token
from api.story_writer.utils.auth import require_authenticated_user
from utils.asset_tracker import save_asset_to_library
from models.story_models import StoryAudioResult
from services.story_writer.audio_generation_service import StoryAudioGenerationService
from pathlib import Path
from utils.logger_utils import get_service_logger
router = APIRouter(tags=["youtube-audio"])
logger = get_service_logger("api.youtube.audio")
# Audio output directory
base_dir = Path(__file__).parent.parent.parent.parent
YOUTUBE_AUDIO_DIR = base_dir / "youtube_audio"
YOUTUBE_AUDIO_DIR.mkdir(parents=True, exist_ok=True)
# Initialize audio service
audio_service = StoryAudioGenerationService(output_dir=str(YOUTUBE_AUDIO_DIR))
# WaveSpeed Minimax Speech voice ids include language-specific voices
# Ref: https://wavespeed.ai/docs/docs-api/minimax/minimax_speech_voice_id
LANGUAGE_CODE_TO_LANGUAGE_BOOST = {
"en": "English",
"es": "Spanish",
"fr": "French",
"de": "German",
"pt": "Portuguese",
"it": "Italian",
"hi": "Hindi",
"ar": "Arabic",
"ru": "Russian",
"ja": "Japanese",
"ko": "Korean",
"zh": "Chinese",
"vi": "Vietnamese",
"id": "Indonesian",
"tr": "Turkish",
"nl": "Dutch",
"pl": "Polish",
"th": "Thai",
"uk": "Ukrainian",
"el": "Greek",
"cs": "Czech",
"fi": "Finnish",
"ro": "Romanian",
}
# Default language-specific Minimax voice IDs (first choice per language). English keeps the existing "persona" voice selection below.
LANGUAGE_BOOST_TO_DEFAULT_VOICE_ID = {
"Spanish": "Spanish_male_1_v1",
"French": "French_male_1_v1",
"German": "German_male_1_v1",
"Portuguese": "Portuguese_male_1_v1",
"Italian": "Italian_male_1_v1",
"Hindi": "Hindi_male_1_v1",
"Arabic": "Arabic_male_1_v1",
"Russian": "Russian_male_1_v1",
"Japanese": "Japanese_male_1_v1",
"Korean": "Korean_male_1_v1",
"Chinese": "Chinese_male_1_v1",
"Vietnamese": "Vietnamese_male_1_v1",
"Indonesian": "Indonesian_male_1_v1",
"Turkish": "Turkish_male_1_v1",
"Dutch": "Dutch_male_1_v1",
"Polish": "Polish_male_1_v1",
"Thai": "Thai_male_1_v1",
"Ukrainian": "Ukrainian_male_1_v1",
"Greek": "Greek_male_1_v1",
"Czech": "Czech_male_1_v1",
"Finnish": "Finnish_male_1_v1",
"Romanian": "Romanian_male_1_v1",
}
def _resolve_language_boost(language: Optional[str], explicit_language_boost: Optional[str]) -> str:
"""
Determine the effective WaveSpeed `language_boost`.
- If user explicitly provided language_boost, use it (including "auto").
- Else if language code provided, map to the WaveSpeed boost label.
- Else default to English (backwards compatible).
"""
if explicit_language_boost is not None and str(explicit_language_boost).strip() != "":
return str(explicit_language_boost).strip()
if language is not None and str(language).strip() != "":
lang_code = str(language).strip().lower()
return LANGUAGE_CODE_TO_LANGUAGE_BOOST.get(lang_code, "auto")
return "English"
def select_optimal_emotion(scene_title: str, narration: str, video_plan_context: Optional[Dict[str, Any]] = None) -> str:
"""
Intelligently select the best emotion for YouTube content based on scene analysis.
Available emotions: "happy", "sad", "angry", "fearful", "disgusted", "surprised", "neutral"
Returns the selected emotion string.
"""
# Default to happy for engaging YouTube content
selected_emotion = "happy"
scene_text = f"{scene_title} {narration}".lower()
# Hook scenes need excitement and energy
if "hook" in scene_title.lower() or any(word in scene_text for word in ["exciting", "amazing", "unbelievable", "shocking", "wow"]):
selected_emotion = "surprised" # Excited and attention-grabbing
# Emotional stories or inspirational content
elif any(word in scene_text for word in ["emotional", "touching", "heartwarming", "inspiring", "motivational"]):
selected_emotion = "happy" # Warm and uplifting
# Serious or professional content
elif any(word in scene_text for word in ["important", "critical", "serious", "professional", "expert"]):
selected_emotion = "neutral" # Professional and serious
# Problem-solving or tutorial content
elif any(word in scene_text for word in ["problem", "solution", "fix", "help", "guide"]):
selected_emotion = "happy" # Helpful and encouraging
# Call-to-action scenes
elif "cta" in scene_title.lower() or any(word in scene_text for word in ["subscribe", "like", "comment", "share", "action"]):
selected_emotion = "happy" # Confident and encouraging
# Negative or concerning topics
elif any(word in scene_text for word in ["warning", "danger", "risk", "problem", "issue"]):
selected_emotion = "neutral" # Serious but not alarming
# Check video plan context for overall tone
if video_plan_context:
tone = video_plan_context.get("tone", "").lower()
if "serious" in tone or "professional" in tone:
selected_emotion = "neutral"
elif "fun" in tone or "entertaining" in tone:
selected_emotion = "happy"
return selected_emotion
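# Illustrative outcomes of the heuristic above (hypothetical scene data):
#   ("Hook - You Won't Believe This", "An amazing trick...")           -> "surprised"
#   ("Scene 3", "A step-by-step guide to fix the problem")             -> "happy"
#   ("Outro CTA", "Subscribe and share", {"tone": "Serious"})          -> "neutral" (plan tone overrides)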
def select_optimal_voice(scene_title: str, narration: str, video_plan_context: Optional[Dict[str, Any]] = None) -> str:
"""
Intelligently select the best voice for YouTube content based on scene analysis.
Analyzes scene title, narration content, and video plan context to choose
the most appropriate voice from available Minimax voices.
Available voices: Wise_Woman, Friendly_Person, Inspirational_girl, Deep_Voice_Man,
Calm_Woman, Casual_Guy, Lively_Girl, Patient_Man, Young_Knight, Determined_Man,
Lovely_Girl, Decent_Boy, Imposing_Manner, Elegant_Man, Abbess, Sweet_Girl_2, Exuberant_Girl
Returns the selected voice_id string.
"""
# Default to Casual_Guy for engaging YouTube content
selected_voice = "Casual_Guy"
# Analyze video plan context for content type
if video_plan_context:
video_type = video_plan_context.get("video_type", "").lower()
target_audience = video_plan_context.get("target_audience", "").lower()
tone = video_plan_context.get("tone", "").lower()
# Educational/Professional content
if any(keyword in video_type for keyword in ["tutorial", "educational", "how-to", "guide", "course"]):
if "professional" in tone or "expert" in target_audience:
selected_voice = "Wise_Woman" # Authoritative and trustworthy
else:
selected_voice = "Patient_Man" # Clear and instructional
# Entertainment/Casual content
elif any(keyword in video_type for keyword in ["entertainment", "vlog", "lifestyle", "story", "review"]):
if "young" in target_audience or "millennial" in target_audience:
selected_voice = "Casual_Guy" # Friendly and relatable
elif "female" in target_audience or "women" in target_audience:
selected_voice = "Lively_Girl" # Energetic and engaging
else:
selected_voice = "Friendly_Person" # Approachable
# Motivational/Inspirational content
elif any(keyword in video_type for keyword in ["motivational", "inspirational", "success", "mindset"]):
selected_voice = "Inspirational_girl" # Uplifting and motivational
# Business/Corporate content
elif any(keyword in video_type for keyword in ["business", "corporate", "finance", "marketing"]):
selected_voice = "Elegant_Man" # Professional and sophisticated
# Tech/Gaming content
elif any(keyword in video_type for keyword in ["tech", "gaming", "software", "app"]):
selected_voice = "Young_Knight" # Energetic and modern
# Analyze scene content for specific voice requirements
scene_text = f"{scene_title} {narration}".lower()
# Hook scenes need energetic, attention-grabbing voices
if "hook" in scene_title.lower() or any(word in scene_text for word in ["attention", "grab", "exciting", "amazing", "unbelievable"]):
selected_voice = "Exuberant_Girl" # Very energetic and enthusiastic
# Emotional/stories need more expressive voices
elif any(word in scene_text for word in ["story", "emotional", "heartwarming", "touching", "inspiring"]):
selected_voice = "Inspirational_girl" # Emotional and inspiring
# Technical explanations need clear, precise voices
elif any(word in scene_text for word in ["technical", "explain", "step-by-step", "process", "how-to"]):
selected_voice = "Calm_Woman" # Clear and methodical
# Call-to-action scenes need confident, persuasive voices
elif "cta" in scene_title.lower() or any(word in scene_text for word in ["subscribe", "like", "comment", "share", "now", "today"]):
selected_voice = "Determined_Man" # Confident and persuasive
logger.info(f"[VoiceSelection] Selected '{selected_voice}' for scene: {scene_title[:50]}...")
return selected_voice
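# Illustrative outcomes (hypothetical inputs). Scene-level cues are evaluated after the
# video-plan defaults, so they take precedence:
#   plan {"video_type": "tutorial", "tone": "professional"}, neutral scene text -> "Wise_Woman"
#   same plan, but scene_title "Hook: ..."                                       -> "Exuberant_Girl"
#   no plan, scene text mentioning "subscribe"                                   -> "Determined_Man"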
class YouTubeAudioRequest(BaseModel):
scene_id: str
scene_title: str
text: str
voice_id: Optional[str] = None # Will auto-select based on content if not provided
language: Optional[str] = None # Language code for multilingual audio (e.g., "en", "es", "fr")
speed: float = 1.0
volume: float = 1.0
pitch: float = 0.0
emotion: str = "happy" # More engaging for YouTube content
english_normalization: bool = False
# Enhanced defaults for high-quality YouTube audio using Minimax Speech 02 HD
# Higher quality settings for professional YouTube content
sample_rate: Optional[int] = 44100 # CD quality: 44100 Hz (valid values: 8000, 16000, 22050, 24000, 32000, 44100)
bitrate: int = 256000 # Highest quality: 256kbps (valid values: 32000, 64000, 128000, 256000)
channel: Optional[str] = "2" # Stereo for richer audio (valid values: "1" or "2")
format: Optional[str] = "mp3" # Universal format for web
language_boost: Optional[str] = None # If not provided, inferred from `language` (or defaults to English)
enable_sync_mode: bool = True
# Context for intelligent voice/emotion selection
video_plan_context: Optional[Dict[str, Any]] = None # Optional video plan for context-aware voice selection
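# Example request body (illustrative values only; field names match the model above):
# {
#   "scene_id": "scene_1",
#   "scene_title": "Hook - The 5-Minute Morning Routine",
#   "text": "[Pacing: fast] What if five minutes could change your whole day?",
#   "language": "es",
#   "enable_sync_mode": true
# }
# Unset fields fall back to the defaults above (44.1 kHz, 256 kbps stereo MP3, auto-selected voice and emotion).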
class YouTubeAudioResponse(BaseModel):
scene_id: str
scene_title: str
audio_filename: str
audio_url: str
provider: str
model: str
voice_id: str
text_length: int
file_size: int
cost: float
@router.post("/audio", response_model=YouTubeAudioResponse)
async def generate_youtube_scene_audio(
request: YouTubeAudioRequest,
current_user: Dict[str, Any] = Depends(get_current_user),
db: Session = Depends(get_db),
):
"""
Generate AI audio for a YouTube scene using shared audio service.
Similar to Podcast's audio generation endpoint.
"""
user_id = require_authenticated_user(current_user)
if not request.text or not request.text.strip():
raise HTTPException(status_code=400, detail="Text is required")
try:
# Preprocess text to remove instructional markers that shouldn't be spoken
# Remove patterns like [Pacing: slow], [Instructions: ...], etc.
import re
processed_text = request.text.strip()
# Remove instructional markers that contain pacing, timing, or other non-spoken content
instructional_patterns = [
r'\[Pacing:\s*[^\]]+\]', # [Pacing: slow]
r'\[Instructions?:\s*[^\]]+\]', # [Instructions: ...]
r'\[Timing:\s*[^\]]+\]', # [Timing: ...]
r'\[Note:\s*[^\]]+\]', # [Note: ...]
r'\[Internal:\s*[^\]]+\]', # [Internal: ...]
]
for pattern in instructional_patterns:
processed_text = re.sub(pattern, '', processed_text, flags=re.IGNORECASE)
# Clean up extra whitespace and normalize
processed_text = re.sub(r'\s+', ' ', processed_text).strip()
if not processed_text:
raise HTTPException(status_code=400, detail="Text became empty after removing instructions. Please provide clean narration text.")
logger.info(f"[YouTubeAudio] Text preprocessing: {len(request.text)} -> {len(processed_text)} characters")
effective_language_boost = _resolve_language_boost(request.language, request.language_boost)
# Intelligent voice and emotion selection based on content analysis
if not request.voice_id:
# If non-English language is selected, default to the language-specific Minimax voice_id.
# Otherwise keep the existing English persona voice selection logic.
if effective_language_boost in LANGUAGE_BOOST_TO_DEFAULT_VOICE_ID and effective_language_boost not in ["English", "auto"]:
selected_voice = LANGUAGE_BOOST_TO_DEFAULT_VOICE_ID[effective_language_boost]
logger.info(
f"[VoiceSelection] Using language-specific default voice '{selected_voice}' "
f"(language_boost={effective_language_boost}, language={request.language})"
)
else:
selected_voice = select_optimal_voice(
request.scene_title,
processed_text,
request.video_plan_context
)
else:
selected_voice = request.voice_id
# Auto-select emotion unless the caller overrode the default
if request.emotion == "happy": # "happy" is the model default, so treat it as "not explicitly set"
selected_emotion = select_optimal_emotion(
request.scene_title,
processed_text,
request.video_plan_context
)
else:
selected_emotion = request.emotion
logger.info(
f"[YouTubeAudio] Voice selection: {selected_voice}, Emotion: {selected_emotion}, "
f"language={request.language}, language_boost={effective_language_boost}"
)
# Build kwargs for optional parameters - use defaults if None
# WaveSpeed API requires specific values, so we provide sensible defaults
# This matches Podcast's approach but with explicit defaults to avoid None errors
optional_kwargs = {}
# DEBUG: Log what values we received
logger.info(
f"[YouTubeAudio] Request parameters: sample_rate={request.sample_rate}, bitrate={request.bitrate}, "
f"channel={request.channel}, format={request.format}, language_boost={request.language_boost}, "
f"effective_language_boost={effective_language_boost}, language={request.language}"
)
# sample_rate: Use provided value or omit (WaveSpeed will use default)
if request.sample_rate is not None:
optional_kwargs["sample_rate"] = request.sample_rate
# bitrate: always provided (model default: 256000 = 256kbps)
# Valid values: 32000, 64000, 128000, 256000
# The Pydantic model defaults bitrate to 256000, so request.bitrate is never None
optional_kwargs["bitrate"] = request.bitrate
# channel: Only include if valid (WaveSpeed only accepts "1" or "2" as strings)
# If None, empty string, or invalid, omit it and WaveSpeed will use default
# NEVER include channel if it's not exactly "1" or "2"
if request.channel is not None and str(request.channel).strip() in ["1", "2"]:
optional_kwargs["channel"] = str(request.channel).strip()
logger.info(f"[YouTubeAudio] Including valid channel: {optional_kwargs['channel']}")
else:
logger.info(f"[YouTubeAudio] Omitting invalid channel: {request.channel}")
# format: Use provided value or omit (WaveSpeed will use default)
if request.format is not None:
optional_kwargs["format"] = request.format
# language_boost: always send resolved value (improves pronunciation and helps multilingual voices)
if effective_language_boost is not None and str(effective_language_boost).strip() != "":
optional_kwargs["language_boost"] = effective_language_boost
logger.info(f"[YouTubeAudio] Final optional_kwargs: {optional_kwargs}")
result: StoryAudioResult = audio_service.generate_ai_audio(
scene_number=0,
scene_title=request.scene_title,
text=processed_text,
user_id=user_id,
voice_id=selected_voice,
speed=request.speed or 1.0,
volume=request.volume or 1.0,
pitch=request.pitch or 0.0,
emotion=selected_emotion,
english_normalization=request.english_normalization or False,
enable_sync_mode=request.enable_sync_mode,
**optional_kwargs,
)
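# `result` is a StoryAudioResult-style mapping; the keys consumed below are
# audio_filename, audio_url, audio_path, file_size, provider, model, cost,
# voice_id and text_length.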
# Override URL to use YouTube endpoint instead of story endpoint
if result.get("audio_url") and "/api/story/audio/" in result.get("audio_url", ""):
audio_filename = result.get("audio_filename", "")
result["audio_url"] = f"/api/youtube/audio/{audio_filename}"
except HTTPException:
# Re-raise intended client errors (e.g. the 400 for text that is empty after cleaning)
raise
except Exception as exc:
logger.error(f"[YouTube] Audio generation failed: {exc}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Audio generation failed: {exc}")
# Save to asset library (youtube_creator module)
try:
if result.get("audio_url"):
save_asset_to_library(
db=db,
user_id=user_id,
asset_type="audio",
source_module="youtube_creator",
filename=result.get("audio_filename", ""),
file_url=result.get("audio_url", ""),
file_path=result.get("audio_path"),
file_size=result.get("file_size"),
mime_type="audio/mpeg",
title=f"{request.scene_title} - YouTube",
description="YouTube scene narration",
tags=["youtube_creator", "audio", request.scene_id],
provider=result.get("provider"),
model=result.get("model"),
cost=result.get("cost"),
asset_metadata={
"scene_id": request.scene_id,
"scene_title": request.scene_title,
"status": "completed",
},
)
except Exception as e:
logger.warning(f"[YouTube] Failed to save audio asset: {e}")
return YouTubeAudioResponse(
scene_id=request.scene_id,
scene_title=request.scene_title,
audio_filename=result.get("audio_filename", ""),
audio_url=result.get("audio_url", ""),
provider=result.get("provider", "wavespeed"),
model=result.get("model", "minimax/speech-02-hd"),
voice_id=result.get("voice_id", selected_voice),
text_length=result.get("text_length", len(request.text)),
file_size=result.get("file_size", 0),
cost=result.get("cost", 0.0),
)
@router.get("/audio/{filename}")
async def serve_youtube_audio(
filename: str,
current_user: Dict[str, Any] = Depends(get_current_user_with_query_token),
):
"""Serve generated YouTube scene audio files.
Supports authentication via Authorization header or token query parameter.
Query parameter is useful for HTML elements like <audio> that cannot send custom headers.
"""
require_authenticated_user(current_user)
# Security check: ensure filename doesn't contain path traversal
if ".." in filename or "/" in filename or "\\" in filename:
raise HTTPException(status_code=400, detail="Invalid filename")
audio_path = (YOUTUBE_AUDIO_DIR / filename).resolve()
# Security check: ensure the resolved path stays inside YOUTUBE_AUDIO_DIR
# (raw string prefix matching could also match sibling directories sharing the prefix)
if not audio_path.is_relative_to(YOUTUBE_AUDIO_DIR.resolve()):
raise HTTPException(status_code=403, detail="Access denied")
if not audio_path.exists():
raise HTTPException(status_code=404, detail="Audio file not found")
return FileResponse(audio_path, media_type="audio/mpeg")