feat: Add Auto-Dubbing feature for Podcast Maker

This commit adds the Auto-Dubbing feature for Podcast Maker with support
for translating podcast audio to different languages with optional voice
cloning to preserve the original speaker's voice.

New Features:
- Translation Service (common module): DeepL integration for low-cost
  translation, WaveSpeed integration for high-quality translation
- Audio Dubbing Service: STT -> Translate -> TTS pipeline with
  voice cloning support
- 9 new API endpoints for dubbing and voice cloning
- Support for 34+ languages
- Cost estimation utilities
- Comprehensive documentation

Files Added:
- services/translation/ (5 files): Translation service module
- services/dubbing/: Audio dubbing service
- api/podcast/handlers/dubbing.py: API endpoints
- docs/AUTO_DUBBING.md: Feature documentation
- CHANGELOG.md: Change log

Files Modified:
- api/podcast/models.py: Added dubbing request/response models
- api/podcast/router.py: Added dubbing routes
- services/__init__.py: Exported translation and dubbing services
- scene_animation.py: Fixed missing Path import
This commit is contained in:
ajaysi
2026-03-24 15:45:51 +05:30
parent 3c58fd555b
commit f503a24b3b
13 changed files with 2448 additions and 3 deletions

View File

@@ -0,0 +1,493 @@
"""
Podcast Dubbing Handlers
Audio dubbing endpoints for translating podcast audio to different languages.
Supports both low-cost (DeepL) and high-quality (WaveSpeed) dubbing, with optional voice cloning.
"""
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from typing import Dict, Any, Optional
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from services.database import get_db
from middleware.auth_middleware import get_current_user
from api.story_writer.utils.auth import require_authenticated_user
from api.story_writer.task_manager import task_manager
from loguru import logger
from ..models import (
PodcastAudioDubRequest,
PodcastAudioDubResponse,
PodcastAudioDubResult,
PodcastAudioDubEstimateRequest,
PodcastAudioDubEstimateResponse,
VoiceCloneRequest,
VoiceCloneResponse,
VoiceCloneResult,
)
from services.dubbing import AudioDubbingService
# Sub-router that collects the dubbing and voice-cloning endpoints declared below.
router = APIRouter()
# Dedicated thread pool (4 workers, threads named "podcast_dubbing*").
# NOTE(review): nothing in the visible part of this module submits work to it;
# the endpoints schedule jobs through FastAPI BackgroundTasks instead. Confirm
# whether it is still needed.
_dubbing_executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="podcast_dubbing")
# Output directory for generated audio: <parents[3] of this file>/data/media/dubbed_audio.
DUBBED_AUDIO_DIR = Path(__file__).resolve().parents[3] / "data" / "media" / "dubbed_audio"
def _ensure_dubbed_audio_dir():
    """Create DUBBED_AUDIO_DIR, including missing parents; a no-op if it already exists."""
    DUBBED_AUDIO_DIR.mkdir(exist_ok=True, parents=True)
def _execute_dubbing_task(
    task_id: str,
    source_audio_url: str,
    source_language: Optional[str],
    target_language: str,
    quality: str,
    voice_id: str,
    speed: float,
    emotion: str,
    use_voice_clone: bool,
    custom_voice_id: Optional[str],
    voice_clone_accuracy: float,
    user_id: str,
):
    """Background task to dub audio.

    Marks the task as "processing", runs ``AudioDubbingService.dub_audio`` with
    the supplied options, and stores the result fields on the task as
    "completed". Any exception is logged with its traceback and recorded on the
    task as "failed"; nothing is re-raised to the caller.

    Args:
        task_id: Task-manager id whose status/progress is updated.
        source_audio_url: Forwarded to the service as ``source_audio``.
        source_language: Source language, or None (forwarded unchanged).
        target_language: Language to dub into.
        quality: Quality tier string forwarded to the service.
        voice_id: TTS voice identifier.
        speed: Speech speed forwarded to the service.
        emotion: Emotion label forwarded to the service.
        use_voice_clone: Whether the service should clone the speaker's voice.
        custom_voice_id: Optional name for the cloned voice.
        voice_clone_accuracy: Forwarded to the service as ``accuracy``.
        user_id: Id of the requesting user, forwarded to the service.
    """

    def progress_callback(progress: float, message: str):
        # Relay progress reported by the service onto the task record.
        task_manager.update_task_status(
            task_id, "processing", progress=progress,
            message=message
        )

    try:
        task_manager.update_task_status(
            task_id, "processing", progress=5.0,
            message="Starting audio dubbing..."
        )
        _ensure_dubbed_audio_dir()
        service = AudioDubbingService(output_dir=DUBBED_AUDIO_DIR)
        logger.info(f"[Dubbing] Task {task_id}: Starting dubbing with voice_clone={use_voice_clone}")
        result = service.dub_audio(
            source_audio=source_audio_url,
            target_language=target_language,
            source_language=source_language,
            voice_id=voice_id,
            speed=speed,
            emotion=emotion,
            quality=quality,
            use_voice_clone=use_voice_clone,
            custom_voice_id=custom_voice_id,
            accuracy=voice_clone_accuracy,
            user_id=user_id,
            progress_callback=progress_callback,
        )
        task_manager.update_task_status(
            task_id, "completed", progress=100.0,
            result={
                "dubbed_audio_url": result.dubbed_audio_url,
                # Only the file name is exposed, not the server-side path.
                "dubbed_audio_filename": Path(result.dubbed_audio_path).name,
                "original_transcript": result.original_transcript,
                "translated_transcript": result.translated_transcript,
                "source_language": result.source_language,
                "target_language": result.target_language,
                "voice_id": result.voice_id,
                "quality": result.quality,
                "duration_seconds": result.duration_seconds,
                "file_size": result.file_size,
                "cost": result.cost,
                "status": "completed",
                "voice_clone_used": result.voice_clone_used,
                "cloned_voice_id": result.cloned_voice_id,
            },
            message="Audio dubbing completed!"
        )
        logger.info(f"[Dubbing] Task {task_id} completed successfully (voice_clone_used={result.voice_clone_used})")
    except Exception as e:
        # Task boundary: catch everything so the failure lands on the task
        # record. logger.exception keeps the traceback, which a plain
        # error-level message with str(e) would drop.
        logger.exception(f"[Dubbing] Task {task_id} failed: {str(e)}")
        task_manager.update_task_status(
            task_id, "failed",
            error=str(e),
            message=f"Dubbing failed: {str(e)}"
        )
def _execute_voice_clone_task(
    task_id: str,
    source_audio_url: str,
    custom_voice_id: Optional[str],
    accuracy: float,
    language_boost: Optional[str],
    user_id: str,
):
    """Background task to clone voice from audio.

    Marks the task as "processing", calls
    ``AudioDubbingService.clone_voice_from_audio``, and stores the returned
    voice details on the task as "completed". Any exception is logged with its
    traceback and recorded on the task as "failed"; nothing is re-raised.

    Args:
        task_id: Task-manager id whose status/progress is updated.
        source_audio_url: Forwarded to the service as ``source_audio``.
        custom_voice_id: Optional name for the cloned voice.
        accuracy: Cloning accuracy forwarded to the service.
        language_boost: Optional language hint forwarded to the service.
        user_id: Id of the requesting user, forwarded to the service.
    """
    try:
        task_manager.update_task_status(
            task_id, "processing", progress=10.0,
            message="Starting voice cloning..."
        )
        _ensure_dubbed_audio_dir()
        service = AudioDubbingService(output_dir=DUBBED_AUDIO_DIR)
        task_manager.update_task_status(
            task_id, "processing", progress=30.0,
            message="Processing audio..."
        )
        voice_info = service.clone_voice_from_audio(
            source_audio=source_audio_url,
            custom_voice_id=custom_voice_id,
            accuracy=accuracy,
            language_boost=language_boost,
            user_id=user_id,
        )
        task_manager.update_task_status(
            task_id, "completed", progress=100.0,
            result={
                "voice_id": voice_info.voice_id,
                "voice_url": voice_info.voice_url,
                "source_language": voice_info.source_language,
                "accuracy": voice_info.accuracy,
                "file_size": voice_info.file_size,
                "status": "completed",
            },
            message="Voice cloning completed!"
        )
        logger.info(f"[VoiceClone] Task {task_id} completed: {voice_info.voice_id}")
    except Exception as e:
        # Task boundary: catch everything so the failure lands on the task
        # record. logger.exception keeps the traceback, which a plain
        # error-level message with str(e) would drop.
        logger.exception(f"[VoiceClone] Task {task_id} failed: {str(e)}")
        task_manager.update_task_status(
            task_id, "failed",
            error=str(e),
            message=f"Voice cloning failed: {str(e)}"
        )
@router.post("/dub/audio", response_model=PodcastAudioDubResponse)
async def create_audio_dubbing_task(
    request: PodcastAudioDubRequest,
    background_tasks: BackgroundTasks,
    current_user: Dict[str, Any] = Depends(get_current_user),
    db: Session = Depends(get_db),
):
    """
    Create an audio dubbing task.
    Translates podcast audio to a target language using STT → Translate → TTS pipeline.
    For high-quality dubbing with voice preservation, set use_voice_clone=True.
    - **source_audio_url**: URL or path to source audio file
    - **target_language**: Target language code (e.g., 'es', 'Spanish')
    - **source_language**: Source language (auto-detected if not provided)
    - **quality**: 'low' (DeepL, cheaper) or 'high' (WaveSpeed, better quality)
    - **voice_id**: Voice ID for TTS (default: 'Wise_Woman')
    - **speed**: Speech speed 0.5-2.0 (default: 1.0)
    - **use_voice_clone**: Use voice cloning to preserve original speaker's voice
    - **custom_voice_id**: Custom name for the cloned voice
    - **voice_clone_accuracy**: Voice cloning accuracy 0.1-1.0 (default: 0.7)
    """
    user_id = require_authenticated_user(current_user)
    task_id = task_manager.create_task("audio_dubbing")

    # The optional request fields may arrive as an explicit null, so each one
    # is coalesced with `or` to the same default the request model declares.
    job_arguments = {
        "task_id": task_id,
        "source_audio_url": request.source_audio_url,
        "source_language": request.source_language,
        "target_language": request.target_language,
        "quality": request.quality,
        "voice_id": request.voice_id or "Wise_Woman",
        "speed": request.speed or 1.0,
        "emotion": request.emotion or "happy",
        "use_voice_clone": request.use_voice_clone or False,
        "custom_voice_id": request.custom_voice_id,
        "voice_clone_accuracy": request.voice_clone_accuracy or 0.7,
        "user_id": user_id,
    }
    # The job is only scheduled here; the caller receives the task id at once.
    background_tasks.add_task(_execute_dubbing_task, **job_arguments)

    logger.info(f"[Dubbing] Created task {task_id} for user {user_id} (voice_clone={request.use_voice_clone})")
    return PodcastAudioDubResponse(
        task_id=task_id,
        status="pending",
        message="Audio dubbing task created",
    )
@router.get("/dub/{task_id}/result", response_model=PodcastAudioDubResult)
async def get_dubbing_result(
    task_id: str,
    current_user: Dict[str, Any] = Depends(get_current_user),
):
    """
    Get the result of a completed dubbing task.
    """
    # Called for its authentication check; the returned user id is not used here.
    # NOTE(review): the task lookup is by id only and is not scoped to this user.
    require_authenticated_user(current_user)

    task_status = task_manager.get_task_status(task_id)
    if not task_status:
        raise HTTPException(status_code=404, detail="Task not found")

    state = task_status.get("status")
    if state == "failed":
        # `or` rather than a .get() default: the key may exist with a None value.
        raise HTTPException(
            status_code=500,
            detail=task_status.get("error") or "Dubbing failed",
        )

    finished = state == "completed"
    # An unfinished task is reported with empty placeholder fields. A finished
    # task reads its stored payload, tolerating a missing or None "result".
    payload = (task_status.get("result") or {}) if finished else {}
    return PodcastAudioDubResult(
        task_id=task_id,
        status="completed" if finished else (state or "pending"),
        dubbed_audio_url=payload.get("dubbed_audio_url", ""),
        dubbed_audio_filename=payload.get("dubbed_audio_filename", ""),
        original_transcript=payload.get("original_transcript", ""),
        translated_transcript=payload.get("translated_transcript", ""),
        source_language=payload.get("source_language", ""),
        target_language=payload.get("target_language", ""),
        voice_id=payload.get("voice_id", ""),
        quality=payload.get("quality", ""),
        duration_seconds=payload.get("duration_seconds", 0),
        file_size=payload.get("file_size", 0),
        cost=payload.get("cost", 0.0),
        voice_clone_used=payload.get("voice_clone_used", False),
        cloned_voice_id=payload.get("cloned_voice_id"),
    )
@router.get("/dub/audio/{filename}")
async def serve_dubbed_audio(
    filename: str,
    current_user: Dict[str, Any] = Depends(get_current_user),
):
    """
    Serve a dubbed audio file.
    """
    # Called for its authentication check; the returned user id is not used here.
    require_authenticated_user(current_user)
    _ensure_dubbed_audio_dir()

    # `filename` comes straight from the URL. Accept only a bare file name so
    # the joined path cannot leave DUBBED_AUDIO_DIR ("..", separators, or an
    # absolute path would otherwise redirect the join).
    if filename in ("", ".", "..") or Path(filename).name != filename:
        raise HTTPException(status_code=404, detail="Audio file not found")

    audio_path = DUBBED_AUDIO_DIR / filename
    # is_file() rather than exists(): a directory must not reach FileResponse.
    if not audio_path.is_file():
        raise HTTPException(status_code=404, detail="Audio file not found")

    return FileResponse(
        path=audio_path,
        media_type="audio/mpeg",
        filename=filename,
    )
@router.post("/dub/estimate", response_model=PodcastAudioDubEstimateResponse)
async def estimate_dubbing_cost(
    request: PodcastAudioDubEstimateRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
):
    """
    Estimate the cost for audio dubbing.
    Set use_voice_clone=True to include voice cloning cost ($0.05).
    """
    # Called for its authentication check; the returned user id is not used here.
    require_authenticated_user(current_user)

    estimator = AudioDubbingService(output_dir=DUBBED_AUDIO_DIR)
    breakdown = estimator.estimate_cost(
        audio_duration_seconds=request.audio_duration_seconds,
        target_language=request.target_language,
        quality=request.quality,
        # An explicit null from the client is treated the same as False.
        use_voice_clone=request.use_voice_clone or False,
    )
    # The service's mapping is expanded directly into the response model's fields.
    return PodcastAudioDubEstimateResponse(**breakdown)
@router.get("/dub/languages")
async def get_supported_dubbing_languages(
    current_user: Dict[str, Any] = Depends(get_current_user),
):
    """
    Get list of supported languages for dubbing.
    """
    # Imported at call time rather than at module import.
    from services.translation import list_supported_languages

    languages = list_supported_languages()
    # Entries are ordered by display name (the mapping's values), not by code.
    ordered_pairs = sorted(languages.items(), key=lambda pair: pair[1])
    entries = [{"code": code, "name": name} for code, name in ordered_pairs]
    return {
        "languages": entries,
        "count": len(languages),
    }
@router.get("/dub/voices")
async def get_available_voices(
    current_user: Dict[str, Any] = Depends(get_current_user),
):
    """
    Get list of available TTS voices for dubbing.
    """
    # (voice id, gender) pairs; each display name is the id with underscores
    # replaced by spaces.
    catalog = (
        ("Wise_Woman", "female"),
        ("Warm_Woman", "female"),
        ("Young_Woman", "female"),
        ("Mature_Woman", "female"),
        ("Gentle_Woman", "female"),
        ("Confident_Man", "male"),
        ("Warm_Man", "male"),
        ("Young_Man", "male"),
        ("Mature_Man", "male"),
        ("Default", "neutral"),
    )
    voices = [
        {"id": voice_key, "name": voice_key.replace("_", " "), "gender": gender}
        for voice_key, gender in catalog
    ]
    return {
        "voices": voices,
        # Derived from the list so the two cannot drift apart.
        "count": len(voices),
        "note": "Voice cloning creates custom voices from audio samples. Use /dub/voices/clone to create one."
    }
@router.post("/dub/voices/clone", response_model=VoiceCloneResponse)
async def create_voice_clone_task(
    request: VoiceCloneRequest,
    background_tasks: BackgroundTasks,
    current_user: Dict[str, Any] = Depends(get_current_user),
    db: Session = Depends(get_db),
):
    """
    Clone a voice from an audio sample.
    Creates a custom voice that can be used for dubbing with preserved speaker identity.
    - **source_audio_url**: URL or path to source audio (10-60 seconds recommended)
    - **custom_voice_id**: Custom name for the cloned voice
    - **accuracy**: Cloning accuracy 0.1-1.0 (higher = better quality but more processing)
    - **language_boost**: Language to optimize the voice for
    """
    user_id = require_authenticated_user(current_user)
    task_id = task_manager.create_task("voice_clone")

    job_arguments = {
        "task_id": task_id,
        "source_audio_url": request.source_audio_url,
        "custom_voice_id": request.custom_voice_id,
        # An explicit null from the client falls back to the model's default.
        "accuracy": request.accuracy or 0.7,
        "language_boost": request.language_boost,
        "user_id": user_id,
    }
    # The job is only scheduled here; the caller receives the task id at once.
    background_tasks.add_task(_execute_voice_clone_task, **job_arguments)

    logger.info(f"[VoiceClone] Created task {task_id} for user {user_id}")
    return VoiceCloneResponse(
        task_id=task_id,
        status="pending",
        message="Voice cloning task created",
    )
@router.get("/dub/voices/{task_id}/result", response_model=VoiceCloneResult)
async def get_voice_clone_result(
    task_id: str,
    current_user: Dict[str, Any] = Depends(get_current_user),
):
    """
    Get the result of a completed voice cloning task.
    """
    # Called for its authentication check; the returned user id is not used here.
    # NOTE(review): the task lookup is by id only and is not scoped to this user.
    require_authenticated_user(current_user)

    task_status = task_manager.get_task_status(task_id)
    if not task_status:
        raise HTTPException(status_code=404, detail="Task not found")

    state = task_status.get("status")
    if state == "failed":
        # `or` rather than a .get() default: the key may exist with a None value.
        raise HTTPException(
            status_code=500,
            detail=task_status.get("error") or "Voice cloning failed",
        )

    if state != "completed":
        # Still pending or processing: report the state with placeholder fields.
        return VoiceCloneResult(
            task_id=task_id,
            voice_id="",
            voice_url="",
            source_language="",
            accuracy=0.0,
            file_size=0,
            status=state or "pending",
        )

    # Tolerate a missing or None "result" instead of failing on None.get(...).
    payload = task_status.get("result") or {}
    return VoiceCloneResult(
        task_id=task_id,
        voice_id=payload.get("voice_id", ""),
        voice_url=payload.get("voice_url", ""),
        source_language=payload.get("source_language", ""),
        accuracy=payload.get("accuracy", 0.7),
        file_size=payload.get("file_size", 0),
        status="completed",
    )
@router.get("/dub/voices/audio/{filename}")
async def serve_voice_audio(
    filename: str,
    current_user: Dict[str, Any] = Depends(get_current_user),
):
    """
    Serve a voice sample audio file.
    """
    # Called for its authentication check; the returned user id is not used here.
    require_authenticated_user(current_user)
    _ensure_dubbed_audio_dir()

    # `filename` comes straight from the URL. Accept only a bare file name so
    # the joined path cannot leave DUBBED_AUDIO_DIR ("..", separators, or an
    # absolute path would otherwise redirect the join).
    if filename in ("", ".", "..") or Path(filename).name != filename:
        raise HTTPException(status_code=404, detail="Voice audio file not found")

    audio_path = DUBBED_AUDIO_DIR / filename
    # is_file() rather than exists(): a directory must not reach FileResponse.
    if not audio_path.is_file():
        raise HTTPException(status_code=404, detail="Voice audio file not found")

    return FileResponse(
        path=audio_path,
        media_type="audio/mpeg",
        filename=filename,
    )

View File

@@ -7,6 +7,7 @@ All Pydantic request/response models for podcast endpoints.
from pydantic import BaseModel, Field, model_validator
from typing import List, Optional, Dict, Any
from datetime import datetime
from enum import Enum
class PodcastProjectResponse(BaseModel):
@@ -320,3 +321,99 @@ class PodcastCombineVideosResponse(BaseModel):
status: str
message: str
class AudioDubbingQuality(str, Enum):
    """Quality tiers for dubbing; members are also plain strings ("low" / "high")."""

    LOW = "low"
    HIGH = "high"

    @classmethod
    def from_string(cls, value: str) -> "AudioDubbingQuality":
        """Return HIGH when *value* is "high" in any letter case, otherwise LOW.

        Unrecognised strings are not rejected; they map to LOW.
        """
        return cls.HIGH if value.lower() == "high" else cls.LOW
class PodcastAudioDubRequest(BaseModel):
    """Request model for audio dubbing."""

    # --- What to dub, and into which language ---
    source_audio_url: str = Field(
        ...,
        description="URL or path to source audio file",
    )
    source_language: Optional[str] = Field(
        default=None,
        description="Source language code (auto-detected if None)",
    )
    target_language: str = Field(
        ...,
        description="Target language for dubbing",
    )
    # Free-form string; the model itself does not restrict it to "low"/"high".
    quality: str = Field(
        default="low",
        description="Translation quality: low (DeepL) or high (WaveSpeed)",
    )

    # --- Text-to-speech options ---
    voice_id: Optional[str] = Field(
        default="Wise_Woman",
        description="Voice ID for TTS",
    )
    speed: Optional[float] = Field(
        default=1.0,
        ge=0.5,
        le=2.0,
        description="Speech speed (0.5-2.0)",
    )
    emotion: Optional[str] = Field(
        default="happy",
        description="Emotion for TTS voice",
    )
    # NOTE(review): the /dub/audio handler does not pass this flag on to the
    # background dubbing task; confirm whether it is meant to have an effect.
    preserve_emotion: Optional[bool] = Field(
        default=True,
        description="Preserve emotional tone in translation",
    )

    # --- Voice-cloning options ---
    use_voice_clone: Optional[bool] = Field(
        default=False,
        description="Use voice cloning to preserve original speaker's voice",
    )
    custom_voice_id: Optional[str] = Field(
        default=None,
        description="Custom name for the cloned voice",
    )
    voice_clone_accuracy: Optional[float] = Field(
        default=0.7,
        ge=0.1,
        le=1.0,
        description="Voice cloning accuracy (0.1-1.0)",
    )
class PodcastAudioDubResponse(BaseModel):
    """Response model for audio dubbing task creation."""

    # Identifier the client uses to fetch the task's result later.
    task_id: str
    # Both defaults describe a freshly created task.
    status: str = "pending"
    message: str = "Audio dubbing task created"
class PodcastAudioDubResult(BaseModel):
    """Response model for completed audio dubbing."""

    # Where the dubbed audio can be fetched, and its bare file name.
    dubbed_audio_url: str
    dubbed_audio_filename: str

    # Transcript before and after translation.
    original_transcript: str
    translated_transcript: str

    # Settings reported for the run.
    source_language: str
    target_language: str
    voice_id: str
    quality: str

    # Output metrics.
    # NOTE(review): duration_seconds is declared int; confirm the dubbing
    # service reports whole seconds.
    duration_seconds: int
    file_size: int
    cost: float

    # Task bookkeeping.
    task_id: str
    status: str = "completed"

    # Voice-cloning outcome.
    voice_clone_used: Optional[bool] = Field(
        default=False,
        description="Whether voice cloning was used",
    )
    cloned_voice_id: Optional[str] = Field(
        default=None,
        description="ID of the cloned voice if voice_clone_used=True",
    )
class PodcastAudioDubEstimateRequest(BaseModel):
    """Request model for dubbing cost estimation."""

    audio_duration_seconds: float = Field(
        ...,
        description="Duration of source audio in seconds",
    )
    target_language: str = Field(
        ...,
        description="Target language",
    )
    # Free-form string; defaults to the "low" tier.
    quality: str = Field(
        default="low",
        description="Translation quality",
    )
    use_voice_clone: Optional[bool] = Field(
        default=False,
        description="Include voice cloning cost",
    )
class PodcastAudioDubEstimateResponse(BaseModel):
    """Response model for dubbing cost estimation."""

    # Character count the estimate is based on.
    estimated_characters: int

    # Cost components; the voice-clone part defaults to 0.0.
    translation_cost: float
    tts_cost: float
    voice_clone_cost: float = 0.0

    # Overall figure and the currency it is expressed in.
    total_cost: float
    currency: str = "USD"
class VoiceCloneRequest(BaseModel):
    """Request model for voice cloning."""

    source_audio_url: str = Field(
        ...,
        description="URL or path to source audio file (10-60 seconds recommended)",
    )
    custom_voice_id: Optional[str] = Field(
        default=None,
        description="Custom name for the cloned voice",
    )
    # Validated to lie within 0.1-1.0 when a number is supplied.
    accuracy: Optional[float] = Field(
        default=0.7,
        ge=0.1,
        le=1.0,
        description="Cloning accuracy (0.1-1.0)",
    )
    language_boost: Optional[str] = Field(
        default=None,
        description="Language to optimize the voice for",
    )
class VoiceCloneResponse(BaseModel):
    """Response model for voice cloning."""

    # Identifier the client uses to fetch the task's result later.
    task_id: str
    # Both defaults describe a freshly created task.
    status: str = "pending"
    message: str = "Voice cloning task created"
class VoiceCloneResult(BaseModel):
    """Response model for completed voice cloning."""

    # Details of the cloned voice.
    voice_id: str
    voice_url: str
    source_language: str
    accuracy: float
    file_size: int

    # Task bookkeeping.
    task_id: str
    status: str = "completed"

View File

@@ -12,7 +12,7 @@ from api.story_writer.utils.auth import require_authenticated_user
from api.story_writer.task_manager import task_manager
# Import all handler routers
from .handlers import projects, analysis, research, script, audio, images, video, avatar
from .handlers import projects, analysis, research, script, audio, images, video, avatar, dubbing
# Create main router
router = APIRouter(prefix="/api/podcast", tags=["Podcast Maker"])
@@ -26,6 +26,7 @@ router.include_router(audio.router)
router.include_router(images.router)
router.include_router(video.router)
router.include_router(avatar.router)
router.include_router(dubbing.router)
@router.get("/task/{task_id}/status")

View File

@@ -5,6 +5,7 @@ Handles scene animation endpoints using WaveSpeed Kling and InfiniteTalk.
"""
import mimetypes
from pathlib import Path
from typing import Any, Dict, Optional
from urllib.parse import quote