feat: Add Auto-Dubbing feature for Podcast Maker

This commit adds the Auto-Dubbing feature for Podcast Maker with support
for translating podcast audio to different languages with optional voice
cloning to preserve the original speaker's voice.

New Features:
- Translation Service (common module): DeepL integration for low-cost
  translation, WaveSpeed integration for high-quality translation
- Audio Dubbing Service: STT -> Translate -> TTS pipeline with
  voice cloning support
- 9 new API endpoints for dubbing and voice cloning
- Support for 34+ languages
- Cost estimation utilities
- Comprehensive documentation

Files Added:
- services/translation/ (5 files): Translation service module
- services/dubbing/: Audio dubbing service
- api/podcast/handlers/dubbing.py: API endpoints
- docs/AUTO_DUBBING.md: Feature documentation
- CHANGELOG.md: Change log

Files Modified:
- api/podcast/models.py: Added dubbing request/response models
- api/podcast/router.py: Added dubbing routes
- services/__init__.py: Export translation and dubbing services
- scene_animation.py: Fixed missing Path import
This commit is contained in:
ajaysi
2026-03-24 15:45:51 +05:30
parent 3c58fd555b
commit f503a24b3b
13 changed files with 2448 additions and 3 deletions

View File

@@ -0,0 +1,79 @@
"""
Translation Service for ALwrity.
Provides text translation capabilities using multiple providers:
- DeepL (low-cost, high-quality text translation)
- WaveSpeed (high-quality video/audio dubbing)
This is a COMMON module that can be used across the entire application:
- Podcast Maker: Audio/video dubbing
- Content Creation: Translate blog posts, marketing copy
- AI Writer: Multilingual content generation
- Video Studio: Video translation and subtitles
Usage:
# Simple usage
from services.translation import translate_text, TranslationQuality
result = translate_text("Hello world", target_language="Spanish")
print(result.translated_text)
# Advanced usage
from services.translation import get_translator
translator = get_translator(TranslationQuality.LOW)
result = translator.translate(
text="Your text here",
target_language="fr",
source_language="en"
)
Environment Variables:
DEEPL_API_KEY - DeepL API key for text translation (free tier: 500k chars/month)
DEEPL_USE_PRO - Set to "true" for DeepL Pro account
Examples:
# Translate a single text
>>> from services.translation import translate_text
>>> result = translate_text("Hello", target_language="es")
>>> print(result.translated_text)
Hola
# Batch translation
>>> from services.translation import translate_batch
>>> results = translate_batch(
... texts=["Hello", "Goodbye"],
... target_language="fr"
... )
# Check supported languages
>>> from services.translation import list_supported_languages
>>> langs = list_supported_languages()
>>> print(f"Supports {len(langs)} languages")
"""
from .base_translation import BaseTranslationProvider, TranslationQuality, TranslationResult
from .deepl_translator import DeepLTranslator
from .translation_factory import (
get_translator,
list_supported_languages,
translate_text,
translate_batch,
is_language_supported,
clear_translator_cache,
)
__all__ = [
# Enums and dataclasses
"TranslationQuality",
"TranslationResult",
# Classes
"BaseTranslationProvider",
"DeepLTranslator",
# Factory functions
"get_translator",
"list_supported_languages",
"is_language_supported",
"clear_translator_cache",
# Convenience functions
"translate_text",
"translate_batch",
]

View File

@@ -0,0 +1,210 @@
"""
Base Translation Provider abstract class.
Defines the interface for all translation providers in ALwrity.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional, Any
class TranslationQuality(str, Enum):
    """Quality tier used to pick a translation provider.

    Members are also plain strings, so ``TranslationQuality.LOW == "low"``
    holds and the values serialize without extra handling.
    """

    # Low-cost tier.
    LOW = "low"
    # High-quality tier.
    HIGH = "high"
@dataclass
class TranslationResult:
    """Outcome of one translation request, as reported by a provider."""

    # Translated output text.
    translated_text: str
    # Source language as supplied by the caller or reported by the provider.
    source_language: str
    # Target language as requested by the caller.
    target_language: str
    # Name of the provider that produced the translation.
    provider: str
    # Quality tier of that provider.
    quality: TranslationQuality
    # Provider-assigned confidence; defaults to 1.0 when none is given.
    confidence: float = 1.0
    # Additional candidate translations, if the provider returned any.
    alternative_translations: List[str] = field(default_factory=list)
    # Free-form provider-specific details.
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dictionary.

        ``quality`` is emitted as its string value; the list and dict
        fields are passed through by reference, not copied.
        """
        leading = ("translated_text", "source_language", "target_language", "provider")
        trailing = ("confidence", "alternative_translations", "metadata")

        payload: Dict[str, Any] = {}
        for attr in leading:
            payload[attr] = getattr(self, attr)
        # Inserted here so the key order matches the field order.
        payload["quality"] = self.quality.value
        for attr in trailing:
            payload[attr] = getattr(self, attr)
        return payload
class BaseTranslationProvider(ABC):
    """Abstract interface shared by every translation provider.

    Concrete providers implement the abstract members below. The base
    class supplies language-name normalization, input validation and a
    helper for chunking long text.
    """

    # Generic language table: code -> English display name. Used only for
    # normalizing caller input; providers report what they actually support
    # through get_supported_languages().
    SUPPORTED_LANGUAGES: Dict[str, str] = {
        "en": "English",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "it": "Italian",
        "pt": "Portuguese",
        "nl": "Dutch",
        "pl": "Polish",
        "ru": "Russian",
        "ja": "Japanese",
        "zh": "Chinese",
        "ko": "Korean",
        "ar": "Arabic",
        "hi": "Hindi",
        "tr": "Turkish",
        "vi": "Vietnamese",
        "th": "Thai",
        "id": "Indonesian",
        "ms": "Malay",
        "fil": "Filipino",
        "he": "Hebrew",
        "cs": "Czech",
        "da": "Danish",
        "fi": "Finnish",
        "el": "Greek",
        "hu": "Hungarian",
        "nb": "Norwegian",
        "ro": "Romanian",
        "sk": "Slovak",
        "sv": "Swedish",
        "uk": "Ukrainian",
        "bg": "Bulgarian",
        "hr": "Croatian",
        "lt": "Lithuanian",
        "lv": "Latvian",
        "et": "Estonian",
        "sl": "Slovenian",
    }

    # Class-level fallback only. __init__ gives each instance its own
    # populated mapping, so this shared dict is never mutated.
    LANGUAGE_CODE_MAPPING: Dict[str, str] = {}

    def __init__(self):
        self._build_language_mapping()

    def _build_language_mapping(self) -> None:
        """Build this instance's lookup from code/name spellings to codes.

        The mapping is stored on the instance rather than written into the
        class-level dict, so a subclass that overrides SUPPORTED_LANGUAGES
        cannot leak its entries into other providers.
        """
        mapping: Dict[str, str] = {}
        for code, name in self.SUPPORTED_LANGUAGES.items():
            mapping[code.lower()] = code
            mapping[name.lower()] = code
            mapping[name.upper()] = code
        self.LANGUAGE_CODE_MAPPING = mapping

    def normalize_language_code(self, language: str) -> str:
        """Resolve a language code or English name to a code.

        Known codes and names resolve to the key used in
        SUPPORTED_LANGUAGES (lower case). Anything unrecognized is returned
        stripped and upper-cased, so callers must not assume a fixed case.

        Args:
            language: Language code or name; surrounding whitespace and
                letter case are ignored.

        Returns:
            The matching SUPPORTED_LANGUAGES key, or the upper-cased input.
        """
        normalized = language.strip().lower()
        if normalized in self.LANGUAGE_CODE_MAPPING:
            return self.LANGUAGE_CODE_MAPPING[normalized]
        if len(normalized) == 2:
            return normalized.upper()
        # Fallback scan for instances whose mapping was never built.
        for code, name in self.SUPPORTED_LANGUAGES.items():
            if name.lower() == normalized or code.lower() == normalized:
                return code
        return normalized.upper()

    @property
    @abstractmethod
    def provider_name(self) -> str:
        """Return the name of the translation provider."""
        pass

    @property
    @abstractmethod
    def quality(self) -> TranslationQuality:
        """Return the quality tier of this provider."""
        pass

    @abstractmethod
    def translate(
        self,
        text: str,
        target_language: str,
        source_language: Optional[str] = None,
    ) -> TranslationResult:
        """
        Translate text to target language.

        Args:
            text: The text to translate
            target_language: Target language code or name
            source_language: Source language code or name (auto-detect if None)

        Returns:
            TranslationResult with translated text and metadata
        """
        pass

    @abstractmethod
    def translate_batch(
        self,
        texts: List[str],
        target_language: str,
        source_language: Optional[str] = None,
    ) -> List[TranslationResult]:
        """
        Translate multiple texts in batch.

        Args:
            texts: List of texts to translate
            target_language: Target language code or name
            source_language: Source language code or name (auto-detect if None)

        Returns:
            List of TranslationResults
        """
        pass

    @abstractmethod
    def get_supported_languages(self) -> Dict[str, str]:
        """Return dictionary of supported language codes and names."""
        pass

    @abstractmethod
    def is_language_supported(self, language: str) -> bool:
        """Check if a language is supported."""
        pass

    @abstractmethod
    def calculate_cost(self, text_length: int, char_count: int = 0) -> float:
        """
        Calculate the cost for translation.

        Args:
            text_length: Number of characters to translate
            char_count: Optional explicit character count

        Returns:
            Estimated cost in USD
        """
        pass

    def validate_text(self, text: str) -> bool:
        """Validate that text is suitable for translation.

        Returns:
            False for empty or whitespace-only text, True otherwise.

        Raises:
            ValueError: If the text is longer than 50000 characters.
        """
        if not text or not text.strip():
            return False
        if len(text) > 50000:
            raise ValueError(f"Text too long: {len(text)} chars. Maximum is 50000.")
        return True

    def split_long_text(self, text: str, max_chars: int = 5000) -> List[str]:
        """Split long text into chunks of at most ``max_chars`` characters.

        Text is split at sentence boundaries (after ``.``, ``!``, ``?`` or
        their full-width CJK forms) and at newlines. Sentences are packed
        greedily into chunks joined by single spaces, and punctuation is
        kept as written. A sentence longer than ``max_chars`` is cut at the
        last space that fits, or mid-word if it has none.

        Args:
            text: Text to split.
            max_chars: Upper bound on the length of each chunk.

        Returns:
            ``[text]`` unchanged if it already fits, else the list of chunks.

        Raises:
            ValueError: If splitting is needed and ``max_chars`` is not
                positive.
        """
        import re  # local import keeps this method self-contained

        if len(text) <= max_chars:
            return [text]
        if max_chars <= 0:
            raise ValueError(f"max_chars must be positive, got {max_chars}")

        sentences = re.split(
            r"(?<=[.!?])\s+|(?<=[\u3002\uff01\uff1f])\s*|\n+",
            text,
        )

        chunks: List[str] = []
        current = ""
        for raw_sentence in sentences:
            sentence = raw_sentence.strip()

            # A single sentence that cannot fit is cut into pieces that do.
            while len(sentence) > max_chars:
                if current:
                    chunks.append(current)
                    current = ""
                cut = sentence.rfind(" ", 0, max_chars + 1)
                if cut <= 0:
                    cut = max_chars
                chunks.append(sentence[:cut].rstrip())
                sentence = sentence[cut:].lstrip()

            if not sentence:
                continue

            candidate = f"{current} {sentence}" if current else sentence
            if len(candidate) <= max_chars:
                current = candidate
            else:
                chunks.append(current)
                current = sentence

        if current:
            chunks.append(current)
        return chunks

View File

@@ -0,0 +1,307 @@
"""
DeepL Translation Provider.
Low-cost, high-quality text translation using DeepL API.
Free tier: 500,000 characters/month
API Documentation: https://www.deepl.com/docs-api
"""
import os
from typing import Dict, List, Optional
import httpx
from utils.logger_utils import get_service_logger
from .base_translation import (
BaseTranslationProvider,
TranslationQuality,
TranslationResult,
)
logger = get_service_logger("translation.deepl")
# Translate endpoint on the api-free host; used unless Pro mode is enabled.
DEEPL_API_URL = "https://api-free.deepl.com/v2/translate"
# Translate endpoint used when the translator is configured for a Pro account.
DEEPL_API_URL_PRO = "https://api.deepl.com/v2/translate"
# Upper-cased language code -> language code sent to the DeepL API.
# Bare "EN" and "PT" resolve to the regional variants "EN-US" and "PT-PT".
DEEPL_LANGUAGE_MAPPING: Dict[str, str] = {
"BG": "BG",
"CS": "CS",
"DA": "DA",
"DE": "DE",
"EL": "EL",
"EN": "EN-US",
"EN-GB": "EN-GB",
"EN-US": "EN-US",
"ES": "ES",
"ET": "ET",
"FI": "FI",
"FR": "FR",
"HU": "HU",
"ID": "ID",
"IT": "IT",
"JA": "JA",
"KO": "KO",
"LT": "LT",
"LV": "LV",
"NB": "NB",
"NL": "NL",
"PL": "PL",
"PT": "PT-PT",
"PT-BR": "PT-BR",
"PT-PT": "PT-PT",
"RO": "RO",
"RU": "RU",
"SK": "SK",
"SL": "SL",
"SV": "SV",
"TR": "TR",
"UK": "UK",
"ZH": "ZH",
"ZH-HANS": "ZH-HANS",
"ZH-HANT": "ZH-HANT",
}
# Lower-cased language code -> display name for the DeepL provider.
# DeepLTranslator.get_supported_languages() returns a copy of this table and
# DeepLTranslator.is_language_supported() tests membership against its keys.
DEEPL_SUPPORTED_LANGUAGES: Dict[str, str] = {
"bg": "Bulgarian",
"cs": "Czech",
"da": "Danish",
"de": "German",
"el": "Greek",
"en": "English (American)",
"en-gb": "English (British)",
"es": "Spanish",
"et": "Estonian",
"fi": "Finnish",
"fr": "French",
"hu": "Hungarian",
"id": "Indonesian",
"it": "Italian",
"ja": "Japanese",
"ko": "Korean",
"lt": "Lithuanian",
"lv": "Latvian",
"nb": "Norwegian",
"nl": "Dutch",
"pl": "Polish",
"pt": "Portuguese",
"pt-br": "Portuguese (Brazilian)",
"pt-pt": "Portuguese (European)",
"ro": "Romanian",
"ru": "Russian",
"sk": "Slovak",
"sl": "Slovenian",
"sv": "Swedish",
"tr": "Turkish",
"uk": "Ukrainian",
"zh": "Chinese",
"zh-hans": "Chinese (Simplified)",
"zh-hant": "Chinese (Traditional)",
}
class DeepLTranslator(BaseTranslationProvider):
    """Translation provider backed by the DeepL REST API (LOW quality tier).

    The API key and account type come from the constructor arguments or,
    failing that, from the DEEPL_API_KEY and DEEPL_USE_PRO environment
    variables.
    """

    # Flat per-character rate (USD) used by calculate_cost().
    COST_PER_CHARACTER = 0.00001

    def __init__(self, api_key: Optional[str] = None, use_pro: bool = False):
        """Create a translator.

        Args:
            api_key: DeepL API key; falls back to DEEPL_API_KEY.
            use_pro: Use the Pro endpoint; also enabled when DEEPL_USE_PRO
                is set to "true".
        """
        super().__init__()
        self._api_key = api_key or os.getenv("DEEPL_API_KEY", "")
        self._use_pro = use_pro or os.getenv("DEEPL_USE_PRO", "false").lower() == "true"
        if not self._api_key:
            logger.warning("DeepL API key not configured. Set DEEPL_API_KEY in environment.")
        self._api_url = DEEPL_API_URL_PRO if self._use_pro else DEEPL_API_URL

    @property
    def provider_name(self) -> str:
        return "DeepL"

    @property
    def quality(self) -> TranslationQuality:
        return TranslationQuality.LOW

    def _get_deepl_lang_code(self, language: str) -> str:
        """Resolve a language code or name to a DeepL *target* language code.

        Unknown input is returned upper-cased and left for the API to reject.
        """
        normalized = self.normalize_language_code(language)
        upper = normalized.upper()
        if upper in DEEPL_LANGUAGE_MAPPING:
            return DEEPL_LANGUAGE_MAPPING[upper]

        wanted = normalized.lower()
        for deepl_code, lang_name in DEEPL_SUPPORTED_LANGUAGES.items():
            if lang_name.lower() == wanted or deepl_code.lower() == wanted:
                # Return the mapped API code, not the table key, so a name
                # such as "English (American)" resolves to "EN-US".
                return DEEPL_LANGUAGE_MAPPING.get(deepl_code.upper(), deepl_code)
        return upper

    def _get_deepl_source_lang_code(self, language: str) -> str:
        """Resolve a language code or name to a DeepL *source* language code.

        DeepL accepts regional variants (EN-US, PT-BR, ZH-HANS, ...) only as
        targets; the source must be the base language, so the variant suffix
        is dropped.
        """
        return self._get_deepl_lang_code(language).split("-", 1)[0]

    def _build_request(
        self,
        texts: List[str],
        target_language: str,
        source_language: Optional[str],
    ):
        """Return ``(headers, payload, target_code)`` for a translate call."""
        target_code = self._get_deepl_lang_code(target_language)
        headers = {
            "Authorization": f"DeepL-Auth-Key {self._api_key}",
            "Content-Type": "application/json",
        }
        payload: Dict[str, object] = {
            "text": texts,
            "target_lang": target_code,
        }
        if source_language:
            payload["source_lang"] = self._get_deepl_source_lang_code(source_language)
        return headers, payload, target_code

    def translate(
        self,
        text: str,
        target_language: str,
        source_language: Optional[str] = None,
    ) -> TranslationResult:
        """Translate one text with DeepL.

        Args:
            text: Text to translate.
            target_language: Target language code or name.
            source_language: Source language code or name; DeepL detects it
                when omitted.

        Returns:
            The translation. ``source_language`` is the caller's value when
            given, otherwise the language DeepL detected.

        Raises:
            ValueError: If the text is too long, the API key is missing, or
                the API returns no translation.
            RuntimeError: If the HTTP request fails or returns an error
                status.
        """
        self.validate_text(text)
        if not self._api_key:
            raise ValueError("DeepL API key not configured. Set DEEPL_API_KEY environment variable.")

        headers, payload, target_code = self._build_request(
            [text], target_language, source_language
        )
        try:
            with httpx.Client(timeout=30.0) as client:
                response = client.post(self._api_url, headers=headers, json=payload)
                response.raise_for_status()
                data = response.json()

            translations = data.get("translations", [])
            if not translations:
                raise ValueError("No translation returned from DeepL API")

            primary = translations[0]
            alternatives = [
                t["text"] for t in translations[1:] if t.get("text")
            ]
            detected_lang = primary.get("detected_source_language", "")
            return TranslationResult(
                translated_text=primary["text"],
                source_language=detected_lang if not source_language else source_language,
                target_language=target_language,
                provider=self.provider_name,
                quality=self.quality,
                confidence=0.95,
                alternative_translations=alternatives,
                metadata={
                    "deepl_target_lang": target_code,
                    "character_count": len(text),
                    "translations_count": len(translations),
                },
            )
        except httpx.HTTPStatusError as e:
            logger.error(f"DeepL API HTTP error: {e.response.status_code} - {e.response.text}")
            raise RuntimeError(f"DeepL API error: {e.response.status_code}") from e
        except httpx.RequestError as e:
            logger.error(f"DeepL API request error: {str(e)}")
            raise RuntimeError(f"DeepL API request failed: {str(e)}") from e

    def translate_batch(
        self,
        texts: List[str],
        target_language: str,
        source_language: Optional[str] = None,
    ) -> List[TranslationResult]:
        """Translate several texts in one DeepL request.

        Args:
            texts: Texts to translate; an empty list returns ``[]`` without
                calling the API.
            target_language: Target language code or name.
            source_language: Source language code or name; DeepL detects it
                per text when omitted.

        Returns:
            One result per translation returned by the API, in order. Each
            result carries the source language DeepL reported for that
            item, falling back to ``source_language`` and then "auto".

        Raises:
            ValueError: If the combined text is too long or the API key is
                missing.
            RuntimeError: If the HTTP request fails or returns an error
                status.
        """
        if not texts:
            return []
        self.validate_text("\n".join(texts))
        if not self._api_key:
            raise ValueError("DeepL API key not configured. Set DEEPL_API_KEY environment variable.")

        headers, payload, target_code = self._build_request(
            texts, target_language, source_language
        )
        try:
            with httpx.Client(timeout=60.0) as client:
                response = client.post(self._api_url, headers=headers, json=payload)
                response.raise_for_status()
                data = response.json()

            return [
                TranslationResult(
                    translated_text=translation["text"],
                    # Detection is reported per item; do not reuse item 0's.
                    source_language=(
                        translation.get("detected_source_language")
                        or source_language
                        or "auto"
                    ),
                    target_language=target_language,
                    provider=self.provider_name,
                    quality=self.quality,
                    confidence=0.95,
                    metadata={
                        "deepl_target_lang": target_code,
                        "batch_size": len(texts),
                    },
                )
                for translation in data.get("translations", [])
            ]
        except httpx.HTTPStatusError as e:
            logger.error(f"DeepL API HTTP error: {e.response.status_code}")
            raise RuntimeError(f"DeepL API error: {e.response.status_code}") from e
        except httpx.RequestError as e:
            logger.error(f"DeepL API request error: {str(e)}")
            raise RuntimeError(f"DeepL API request failed: {str(e)}") from e

    def get_supported_languages(self) -> Dict[str, str]:
        """Return a copy of the DeepL language table (code -> name)."""
        return DEEPL_SUPPORTED_LANGUAGES.copy()

    def is_language_supported(self, language: str) -> bool:
        """Return True if the normalized language is a DeepL table key."""
        normalized = self.normalize_language_code(language).lower()
        return normalized in DEEPL_SUPPORTED_LANGUAGES

    def calculate_cost(self, text_length: int, char_count: int = 0) -> float:
        """Estimate the cost in USD; ``char_count`` wins when non-zero."""
        chars = char_count or text_length
        return chars * self.COST_PER_CHARACTER

    def get_usage_info(self) -> Dict[str, object]:
        """Fetch character usage for the configured DeepL account.

        Returns:
            ``{"configured": False, ...}`` when no API key is set. Otherwise
            a dict with ``character_count``, ``character_limit`` and
            ``usage_percent`` (0.0 when the limit is missing or zero), or
            ``{"configured": True, "error": ...}`` if the request fails.
        """
        if not self._api_key:
            return {"configured": False, "message": "API key not set"}

        usage_url = "https://api-free.deepl.com/v2/usage" if not self._use_pro else "https://api.deepl.com/v2/usage"
        headers = {
            "Authorization": f"DeepL-Auth-Key {self._api_key}",
        }
        try:
            with httpx.Client(timeout=10.0) as client:
                response = client.get(usage_url, headers=headers)
                response.raise_for_status()
                data = response.json()

            used = data.get("character_count", 0)
            limit = data.get("character_limit", 0)
            return {
                "configured": True,
                "character_count": used,
                "character_limit": limit,
                # Guard against a zero or absent limit instead of dividing by it.
                "usage_percent": (used / limit) * 100 if limit else 0.0,
            }
        except Exception as e:
            # Best-effort diagnostic call: report the failure, don't raise.
            logger.error(f"Failed to get DeepL usage info: {str(e)}")
            return {"configured": True, "error": str(e)}

View File

@@ -0,0 +1,172 @@
"""
Translation Factory.
Factory pattern for getting translation providers based on quality tier.
"""
from typing import Dict, Optional
from utils.logger_utils import get_service_logger
from .base_translation import (
BaseTranslationProvider,
TranslationQuality,
TranslationResult,
)
from .deepl_translator import DeepLTranslator
logger = get_service_logger("translation.factory")
_TRANSLATOR_CACHE: Dict[str, BaseTranslationProvider] = {}
def get_translator(
    quality: TranslationQuality = TranslationQuality.LOW,
    force_new: bool = False,
    **kwargs,
) -> BaseTranslationProvider:
    """
    Get a translation provider instance based on quality tier.

    Instances are cached per quality tier and keyword arguments, so
    repeated calls with the same arguments share one provider.

    Args:
        quality: The quality tier (LOW or HIGH)
        force_new: Force creation of new instance instead of cached
        **kwargs: Additional arguments for the provider

    Returns:
        Translation provider instance

    Raises:
        ValueError: If quality tier is not supported
    """
    # Key on the *contents* of kwargs. id(kwargs) is the address of a
    # throw-away dict: it differs between equal calls and can be reused by
    # an unrelated call, which would return a mismatched provider.
    cache_key = f"{quality.value}_{sorted(kwargs.items())!r}"
    if not force_new and cache_key in _TRANSLATOR_CACHE:
        return _TRANSLATOR_CACHE[cache_key]

    if quality == TranslationQuality.LOW:
        translator = DeepLTranslator(**kwargs)
        logger.info("Created DeepL translator (LOW quality)")
    elif quality == TranslationQuality.HIGH:
        # Imported lazily so the WaveSpeed module is only loaded on demand.
        from .wavespeed_translator import WaveSpeedTranslator
        translator = WaveSpeedTranslator(**kwargs)
        logger.info("Created WaveSpeed translator (HIGH quality)")
    else:
        raise ValueError(f"Unsupported translation quality: {quality}")

    _TRANSLATOR_CACHE[cache_key] = translator
    return translator
def translate_text(
    text: str,
    target_language: str,
    source_language: Optional[str] = None,
    quality: TranslationQuality = TranslationQuality.LOW,
) -> TranslationResult:
    """
    Translate a single text with the provider for the given quality tier.

    Args:
        text: Text to translate
        target_language: Target language code or name
        source_language: Source language (auto-detect if None)
        quality: Quality tier used to select the provider

    Returns:
        TranslationResult produced by the selected provider
    """
    return get_translator(quality).translate(
        text,
        target_language,
        source_language,
    )
def translate_batch(
    texts: list[str],
    target_language: str,
    source_language: Optional[str] = None,
    quality: TranslationQuality = TranslationQuality.LOW,
) -> list[TranslationResult]:
    """
    Translate several texts with the provider for the given quality tier.

    Args:
        texts: List of texts to translate
        target_language: Target language code or name
        source_language: Source language (auto-detect if None)
        quality: Quality tier used to select the provider

    Returns:
        List of TranslationResults produced by the selected provider
    """
    provider = get_translator(quality)
    batch_results = provider.translate_batch(texts, target_language, source_language)
    return batch_results
def list_supported_languages(
    quality: Optional[TranslationQuality] = None,
) -> Dict[str, str]:
    """
    List supported languages.

    Args:
        quality: Optional quality filter. When None, the languages of all
            available providers are merged.

    Returns:
        A new dictionary of language codes to names
    """
    if quality == TranslationQuality.LOW:
        return DeepLTranslator().get_supported_languages()
    if quality == TranslationQuality.HIGH:
        from .wavespeed_translator import WaveSpeedTranslator
        return WaveSpeedTranslator().get_supported_languages()

    # Merge what each provider reports through get_supported_languages().
    # The SUPPORTED_LANGUAGES class attribute is the generic normalization
    # table, not a statement of what a provider can translate.
    languages = DeepLTranslator().get_supported_languages()
    try:
        from .wavespeed_translator import WaveSpeedTranslator
        wavespeed_langs = WaveSpeedTranslator().get_supported_languages()
    except Exception:
        # WaveSpeed is optional here; fall back to the DeepL list alone.
        return languages
    return {**languages, **wavespeed_langs}
def is_language_supported(
    language: str,
    quality: Optional[TranslationQuality] = None,
) -> bool:
    """
    Report whether a language can be translated.

    Args:
        language: Language code or name
        quality: Optional quality filter; when None, either provider
            supporting the language is enough

    Returns:
        True if supported
    """
    if quality == TranslationQuality.HIGH:
        from .wavespeed_translator import WaveSpeedTranslator
        return WaveSpeedTranslator().is_language_supported(language)

    deepl_supports = DeepLTranslator().is_language_supported(language)
    if quality == TranslationQuality.LOW:
        return deepl_supports
    # No filter: WaveSpeed is consulted only when DeepL says no.
    return deepl_supports or _check_wavespeed_support(language)
def _check_wavespeed_support(language: str) -> bool:
try:
from .wavespeed_translator import WaveSpeedTranslator
return WaveSpeedTranslator().is_language_supported(language)
except (ImportError, Exception):
return False
def clear_translator_cache() -> None:
    """Drop every cached translation provider instance."""
    # The dict is emptied in place, never rebound, so no ``global`` is needed.
    _TRANSLATOR_CACHE.clear()
    logger.info("Translation provider cache cleared")

View File

@@ -0,0 +1,138 @@
"""
WaveSpeed Translation Provider.
High-quality video/text translation using WaveSpeed API.
This will be used for Phase 3 (High-Quality Dubbing).
API: Uses existing WaveSpeed video translation API.
"""
from typing import Dict, List, Optional
from utils.logger_utils import get_service_logger
from .base_translation import (
BaseTranslationProvider,
TranslationQuality,
TranslationResult,
)
logger = get_service_logger("translation.wavespeed")
# Lower-cased language code -> display name for the WaveSpeed provider.
# WaveSpeedTranslator.get_supported_languages() returns a copy of this table
# and WaveSpeedTranslator.is_language_supported() tests membership against
# its keys.
WAVESPEED_SUPPORTED_LANGUAGES: Dict[str, str] = {
"en": "English",
"es": "Spanish",
"fr": "French",
"de": "German",
"it": "Italian",
"pt": "Portuguese",
"ja": "Japanese",
"ko": "Korean",
"zh": "Chinese",
"ar": "Arabic",
"hi": "Hindi",
"ru": "Russian",
"nl": "Dutch",
"pl": "Polish",
"tr": "Turkish",
"vi": "Vietnamese",
"th": "Thai",
"id": "Indonesian",
"ms": "Malay",
"fil": "Filipino",
"he": "Hebrew",
"cs": "Czech",
"da": "Danish",
"fi": "Finnish",
"el": "Greek",
"hu": "Hungarian",
"nb": "Norwegian",
"ro": "Romanian",
"sk": "Slovak",
"sv": "Swedish",
"uk": "Ukrainian",
}
class WaveSpeedTranslator(BaseTranslationProvider):
    """Provider for the HIGH quality tier, backed by WaveSpeed.

    Text translation is not implemented yet; only ``translate_video`` does
    real work.
    """

    # Flat per-character rate (USD) used by calculate_cost().
    COST_PER_CHARACTER = 0.0001

    def __init__(self):
        super().__init__()
        logger.info("[WaveSpeedTranslator] Initialized (high-quality mode)")

    @property
    def provider_name(self) -> str:
        return "WaveSpeed"

    @property
    def quality(self) -> TranslationQuality:
        return TranslationQuality.HIGH

    def translate(
        self,
        text: str,
        target_language: str,
        source_language: Optional[str] = None,
    ) -> TranslationResult:
        """Validate ``text``, then raise: text translation is unavailable.

        Raises:
            ValueError: If the text exceeds the base-class length limit.
            NotImplementedError: Always, once validation has passed.
        """
        self.validate_text(text)
        message = (
            "WaveSpeed text translation not yet implemented. "
            "For high-quality translation, use the video translation API "
            "or fall back to DeepL for text translation."
        )
        raise NotImplementedError(message)

    def translate_batch(
        self,
        texts: List[str],
        target_language: str,
        source_language: Optional[str] = None,
    ) -> List[TranslationResult]:
        """Always raises NotImplementedError; batch mode is unavailable."""
        raise NotImplementedError("WaveSpeed batch translation not yet implemented.")

    def get_supported_languages(self) -> Dict[str, str]:
        """Return a copy of the WaveSpeed language table (code -> name)."""
        return dict(WAVESPEED_SUPPORTED_LANGUAGES)

    def is_language_supported(self, language: str) -> bool:
        """Return True if the normalized language is a WaveSpeed table key."""
        return self.normalize_language_code(language).lower() in WAVESPEED_SUPPORTED_LANGUAGES

    def calculate_cost(self, text_length: int, char_count: int = 0) -> float:
        """Estimate the cost in USD; ``char_count`` wins when non-zero."""
        billable = char_count if char_count else text_length
        return billable * self.COST_PER_CHARACTER

    def translate_video(
        self,
        video_path: str,
        target_language: str,
        source_language: Optional[str] = None,
    ) -> bytes:
        """
        Translate a video file through the WaveSpeed video translation API.

        Args:
            video_path: Path to video file
            target_language: Target language code or name
            source_language: Accepted for interface symmetry; not currently
                passed to the API

        Returns:
            Whatever ``VideoTranslation.video_translate`` returns
            (annotated as the translated video bytes)
        """
        # Imported lazily so the generator module is only loaded on demand.
        from ..wavespeed.generators.video.translation import VideoTranslation

        video_api = VideoTranslation()
        output_language = self.normalize_language_code(target_language)
        with open(video_path, "rb") as video_file:
            payload = video_file.read()
        return video_api.video_translate(
            video=payload,
            output_language=output_language,
            enable_sync_mode=True,
        )