AI story writer enhancements, text to video and voice generation, subscription management, and more.

This commit is contained in:
ajaysi
2025-11-19 09:55:32 +05:30
parent bf7493c366
commit e96525347b
64 changed files with 10367 additions and 400 deletions

View File

@@ -134,6 +134,12 @@ def generate(
current_video_calls = getattr(summary, "video_calls", 0) or 0
video_limit = limits['limits'].get("video_calls", 0) if limits else 0
# Get audio stats for unified log
current_audio_calls = getattr(summary, "audio_calls", 0) or 0
audio_limit = limits['limits'].get("audio_calls", 0) if limits else 0
# Only show ∞ for Enterprise tier when limit is 0 (unlimited)
audio_limit_display = audio_limit if (audio_limit > 0 or tier != 'enterprise') else ''
db_track.commit()
logger.info(f"[images.generate] ✅ Successfully tracked usage: user {user_id} -> stability -> {new_calls} calls")
@@ -148,6 +154,7 @@ def generate(
├─ Calls: {current_calls_before}{new_calls} / {call_limit if call_limit > 0 else ''}
├─ Image Editing: {current_image_edit_calls} / {image_edit_limit if image_edit_limit > 0 else ''}
├─ Videos: {current_video_calls} / {video_limit if video_limit > 0 else ''}
├─ Audio: {current_audio_calls} / {audio_limit_display}
└─ Status: ✅ Allowed & Tracked
""")
except Exception as track_error:
@@ -437,6 +444,12 @@ def edit(
current_video_calls = getattr(summary, "video_calls", 0) or 0
video_limit = limits['limits'].get("video_calls", 0) if limits else 0
# Get audio stats for unified log
current_audio_calls = getattr(summary, "audio_calls", 0) or 0
audio_limit = limits['limits'].get("audio_calls", 0) if limits else 0
# Only show ∞ for Enterprise tier when limit is 0 (unlimited)
audio_limit_display = audio_limit if (audio_limit > 0 or tier != 'enterprise') else ''
db_track.commit()
logger.info(f"[images.edit] ✅ Successfully tracked usage: user {user_id} -> image_edit -> {new_calls} calls")
@@ -451,6 +464,7 @@ def edit(
├─ Calls: {current_calls_before}{new_calls} / {call_limit if call_limit > 0 else ''}
├─ Images: {current_image_gen_calls} / {image_gen_limit if image_gen_limit > 0 else ''}
├─ Videos: {current_video_calls} / {video_limit if video_limit > 0 else ''}
├─ Audio: {current_audio_calls} / {audio_limit_display}
└─ Status: ✅ Allowed & Tracked
""")
except Exception as track_error:

View File

@@ -5,12 +5,19 @@ Main router for story generation operations including premise, outline,
content generation, and full story creation.
"""
from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks
from typing import Any, Dict, Union, List, Optional
import mimetypes
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request
from loguru import logger
from middleware.auth_middleware import get_current_user
from middleware.auth_middleware import get_current_user, get_current_user_with_query_token
from models.story_models import (
AnimateSceneRequest,
AnimateSceneVoiceoverRequest,
AnimateSceneResponse,
ResumeSceneAnimationRequest,
StoryGenerationRequest,
StorySetupGenerationRequest,
StorySetupGenerationResponse,
@@ -34,24 +41,66 @@ from models.story_models import (
StoryVideoResult,
TaskStatus,
)
from pydantic import BaseModel, Field
from services.database import get_db
from services.llm_providers.main_video_generation import track_video_usage
from services.story_writer.story_service import StoryWriterService
from .task_manager import task_manager
from .cache_manager import cache_manager
from services.story_writer.video_generation_service import StoryVideoGenerationService
from services.subscription import PricingService
from services.subscription.preflight_validator import validate_scene_animation_operation
from services.wavespeed.kling_animation import animate_scene_image, resume_scene_animation
from services.wavespeed.infinitetalk import animate_scene_with_voiceover
from uuid import uuid4
from pydantic import BaseModel
from pathlib import Path
from utils.logger_utils import get_service_logger
from .cache_manager import cache_manager
from .routes import cache_routes, media_generation, story_content, story_setup, story_tasks, video_generation
from .task_manager import task_manager
from .utils.auth import require_authenticated_user
from .utils.media_utils import resolve_media_file
from .utils.hd_video import (
generate_hd_video_payload,
generate_hd_video_scene_payload,
)
from .utils.hd_video import generate_hd_video_payload, generate_hd_video_scene_payload
from .utils.media_utils import load_story_image_bytes, load_story_audio_bytes, resolve_media_file
from urllib.parse import quote
router = APIRouter(prefix="/api/story", tags=["Story Writer"])
# Include modular routers (order preserved roughly by workflow)
router.include_router(story_setup.router)
router.include_router(story_content.router)
router.include_router(story_tasks.router)
router.include_router(media_generation.router)
router.include_router(video_generation.router)
router.include_router(cache_routes.router)
service = StoryWriterService()
scene_logger = get_service_logger("api.story_writer.scene_animation")
AI_VIDEO_SUBDIR = Path("AI_Videos")
def _build_authenticated_media_url(request: Request, path: str) -> str:
"""Append the caller's auth token to a media URL so <video>/<img> tags can access it."""
if not path:
return path
token: Optional[str] = None
auth_header = request.headers.get("Authorization")
if auth_header and auth_header.startswith("Bearer "):
token = auth_header.replace("Bearer ", "").strip()
elif "token" in request.query_params:
token = request.query_params["token"]
if token:
separator = "&" if "?" in path else "?"
path = f"{path}{separator}token={quote(token)}"
return path
def _guess_mime_from_url(url: str, fallback: str) -> str:
if not url:
return fallback
mime, _ = mimetypes.guess_type(url)
return mime or fallback
@router.get("/health")
@@ -558,6 +607,22 @@ async def get_task_result(
logger.error(f"[StoryWriter] Failed to get task result: {e}")
raise HTTPException(status_code=500, detail=str(e))
class PromptOptimizeRequest(BaseModel):
    """Request body accepted by the prompt optimization endpoint."""

    # Raw prompt text to be optimized; the only required field.
    text: str = Field(..., description="The prompt text to optimize")
    # Target medium; constrained by the pattern to "image" or "video".
    mode: Optional[str] = Field(default="image", pattern="^(image|video)$", description="Optimization mode: 'image' or 'video'")
    # Stylistic preset; constrained by the pattern to the listed values.
    style: Optional[str] = Field(
        default="default",
        pattern="^(default|artistic|photographic|technical|anime|realistic)$",
        description="Style: 'default', 'artistic', 'photographic', 'technical', 'anime', or 'realistic'"
    )
    # Optional base64-encoded reference image passed along as extra context.
    image: Optional[str] = Field(None, description="Base64-encoded image for context (optional)")
class PromptOptimizeResponse(BaseModel):
    """Response body returned by the prompt optimization endpoint."""

    # The rewritten prompt produced by the optimizer.
    optimized_prompt: str
    # Success flag; the endpoint sets it to True when it returns this model.
    success: bool
class HDVideoRequest(BaseModel):
prompt: str
provider: str = "huggingface"
@@ -692,6 +757,51 @@ async def generate_scene_images(
raise HTTPException(status_code=500, detail=str(e))
@router.post("/optimize-prompt", response_model=PromptOptimizeResponse)
async def optimize_prompt(
    request: PromptOptimizeRequest,
    current_user: Dict[str, Any] = Depends(get_current_user)
) -> PromptOptimizeResponse:
    """Optimize an image or video prompt using the WaveSpeed prompt optimizer.

    Args:
        request: Prompt text plus optional mode, style and reference image.
        current_user: Authenticated user resolved by ``get_current_user``.

    Returns:
        The optimized prompt with ``success=True``.

    Raises:
        HTTPException: 401 when the user is missing or has no ``id``, 400 when
            the prompt text is blank, 500 for any other failure.
    """
    try:
        if not current_user:
            raise HTTPException(status_code=401, detail="Authentication required")

        user_id = str(current_user.get('id', ''))
        if not user_id:
            raise HTTPException(status_code=401, detail="Invalid user ID in authentication token")

        prompt_text = (request.text or "").strip()
        if not prompt_text:
            raise HTTPException(status_code=400, detail="Prompt text is required")

        logger.info(f"[StoryWriter] Optimizing prompt for user {user_id} (mode={request.mode}, style={request.style})")

        from fastapi.concurrency import run_in_threadpool
        from services.wavespeed.client import WaveSpeedClient

        client = WaveSpeedClient()
        # The client call is synchronous and may wait up to `timeout` seconds;
        # run it in the threadpool so it does not stall the event loop.
        optimized_prompt = await run_in_threadpool(
            client.optimize_prompt,
            text=prompt_text,
            mode=request.mode or "image",
            style=request.style or "default",
            image=request.image,  # Optional base64 image
            enable_sync_mode=True,
            timeout=30,
        )

        logger.info(f"[StoryWriter] Prompt optimized successfully for user {user_id}")

        return PromptOptimizeResponse(
            optimized_prompt=optimized_prompt,
            success=True
        )
    except HTTPException:
        # Preserve deliberate 4xx responses raised above.
        raise
    except Exception as e:
        logger.error(f"[StoryWriter] Failed to optimize prompt: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@router.get("/images/{image_filename}")
async def serve_scene_image(
image_filename: str,
@@ -793,32 +903,376 @@ async def generate_scene_audio(
raise HTTPException(status_code=500, detail=str(e))
@router.get("/audio/{audio_filename}")
async def serve_scene_audio(
audio_filename: str,
current_user: Dict[str, Any] = Depends(get_current_user)
):
"""Serve a generated story scene audio file."""
# Audio serving endpoint is handled by routes/media_generation.py
# No duplicate endpoint needed here
# ---------------------------
# Scene Animation Endpoints
# ---------------------------
@router.post("/animate-scene-preview", response_model=AnimateSceneResponse)
async def animate_scene_preview(
request_obj: Request,
request: AnimateSceneRequest,
current_user: Dict[str, Any] = Depends(get_current_user),
) -> AnimateSceneResponse:
"""
Animate a single scene image using WaveSpeed Kling v2.5 Turbo Std.
"""
if not current_user:
raise HTTPException(status_code=401, detail="Authentication required")
user_id = str(current_user.get("id", ""))
if not user_id:
raise HTTPException(status_code=401, detail="Invalid user ID in authentication token")
duration = request.duration or 5
if duration not in (5, 10):
raise HTTPException(status_code=400, detail="Duration must be 5 or 10 seconds.")
scene_logger.info(
"[AnimateScene] User=%s scene=%s duration=%s image_url=%s",
user_id,
request.scene_number,
duration,
request.image_url,
)
image_bytes = load_story_image_bytes(request.image_url)
if not image_bytes:
scene_logger.warning("[AnimateScene] Missing image bytes for user=%s scene=%s", user_id, request.scene_number)
raise HTTPException(status_code=404, detail="Scene image not found. Generate images first.")
db = next(get_db())
try:
require_authenticated_user(current_user)
pricing_service = PricingService(db)
validate_scene_animation_operation(pricing_service=pricing_service, user_id=user_id)
finally:
db.close()
from services.story_writer.audio_generation_service import StoryAudioGenerationService
from fastapi.responses import FileResponse
animation_result = animate_scene_image(
image_bytes=image_bytes,
scene_data=request.scene_data,
story_context=request.story_context,
user_id=user_id,
duration=duration,
)
audio_service = StoryAudioGenerationService()
audio_path = resolve_media_file(audio_service.output_dir, audio_filename)
base_dir = Path(__file__).parent.parent.parent
ai_video_dir = base_dir / "story_videos" / AI_VIDEO_SUBDIR
ai_video_dir.mkdir(parents=True, exist_ok=True)
video_service = StoryVideoGenerationService(output_dir=str(ai_video_dir))
return FileResponse(
path=str(audio_path),
media_type="audio/mpeg",
filename=audio_filename
save_result = video_service.save_scene_video(
video_bytes=animation_result["video_bytes"],
scene_number=request.scene_number,
user_id=user_id,
)
video_filename = save_result["video_filename"]
video_url = _build_authenticated_media_url(
request_obj, f"/api/story/videos/ai/{video_filename}"
)
usage_info = track_video_usage(
user_id=user_id,
provider=animation_result["provider"],
model_name=animation_result["model_name"],
prompt=animation_result["prompt"],
video_bytes=animation_result["video_bytes"],
cost_override=animation_result["cost"],
)
if usage_info:
scene_logger.warning(
"[AnimateScene] Video usage tracked user=%s: %s%s / %s (cost +$%.2f, total=$%.2f)",
user_id,
usage_info.get("previous_calls"),
usage_info.get("current_calls"),
usage_info.get("video_limit_display"),
usage_info.get("cost_per_video", 0.0),
usage_info.get("total_video_cost", 0.0),
)
except HTTPException:
raise
except Exception as e:
logger.error(f"[StoryWriter] Failed to serve audio: {e}")
raise HTTPException(status_code=500, detail=str(e))
scene_logger.info(
"[AnimateScene] ✅ Completed user=%s scene=%s duration=%s cost=$%.2f video=%s",
user_id,
request.scene_number,
animation_result["duration"],
animation_result["cost"],
video_url,
)
return AnimateSceneResponse(
success=True,
scene_number=request.scene_number,
video_filename=video_filename,
video_url=video_url,
duration=animation_result["duration"],
cost=animation_result["cost"],
prompt_used=animation_result["prompt"],
provider=animation_result["provider"],
prediction_id=animation_result.get("prediction_id"),
)
@router.post("/animate-scene-resume", response_model=AnimateSceneResponse)
async def resume_scene_animation_endpoint(
    request_obj: Request,
    request: ResumeSceneAnimationRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> AnimateSceneResponse:
    """Resume downloading a WaveSpeed animation when the initial call timed out.

    Fetches the result for ``request.prediction_id``, saves the video under
    the AI video directory, records video usage and returns a URL carrying
    the caller's token.

    Args:
        request_obj: Raw request, used to build the authenticated video URL.
        request: Scene number, prediction id and optional duration.
        current_user: Authenticated user resolved by ``get_current_user``.

    Raises:
        HTTPException: 401 when the user is missing or has no ``id``.
    """
    if not current_user:
        raise HTTPException(status_code=401, detail="Authentication required")

    user_id = str(current_user.get("id", ""))
    if not user_id:
        raise HTTPException(status_code=401, detail="Invalid user ID in authentication token")

    scene_logger.info(
        "[AnimateScene] Resume requested user=%s scene=%s prediction=%s",
        user_id,
        request.scene_number,
        request.prediction_id,
    )

    animation_result = resume_scene_animation(
        prediction_id=request.prediction_id,
        duration=request.duration or 5,
        user_id=user_id,
    )

    # Videos live in <three levels above this file>/story_videos/AI_Videos.
    base_dir = Path(__file__).parent.parent.parent
    ai_video_dir = base_dir / "story_videos" / AI_VIDEO_SUBDIR
    ai_video_dir.mkdir(parents=True, exist_ok=True)
    video_service = StoryVideoGenerationService(output_dir=str(ai_video_dir))

    save_result = video_service.save_scene_video(
        video_bytes=animation_result["video_bytes"],
        scene_number=request.scene_number,
        user_id=user_id,
    )
    video_filename = save_result["video_filename"]
    video_url = _build_authenticated_media_url(
        request_obj, f"/api/story/videos/ai/{video_filename}"
    )

    usage_info = track_video_usage(
        user_id=user_id,
        provider=animation_result["provider"],
        model_name=animation_result["model_name"],
        prompt=animation_result["prompt"],
        video_bytes=animation_result["video_bytes"],
        cost_override=animation_result["cost"],
    )
    if usage_info:
        # Previous and current call counts are separated so the two numbers
        # do not run together in the log line.
        scene_logger.warning(
            "[AnimateScene] (Resume) Video usage tracked user=%s: %s -> %s / %s (cost +$%.2f, total=$%.2f)",
            user_id,
            usage_info.get("previous_calls"),
            usage_info.get("current_calls"),
            usage_info.get("video_limit_display"),
            usage_info.get("cost_per_video", 0.0),
            usage_info.get("total_video_cost", 0.0),
        )

    scene_logger.info(
        "[AnimateScene] ✅ Resume completed user=%s scene=%s prediction=%s video=%s",
        user_id,
        request.scene_number,
        request.prediction_id,
        video_url,
    )

    return AnimateSceneResponse(
        success=True,
        scene_number=request.scene_number,
        video_filename=video_filename,
        video_url=video_url,
        duration=animation_result["duration"],
        cost=animation_result["cost"],
        prompt_used=animation_result["prompt"],
        provider=animation_result["provider"],
        prediction_id=animation_result.get("prediction_id"),
    )
@router.post("/animate-scene-voiceover", response_model=Dict[str, Any])
async def animate_scene_voiceover_endpoint(
    request_obj: Request,
    request: AnimateSceneVoiceoverRequest,
    background_tasks: BackgroundTasks,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> Dict[str, Any]:
    """
    Animate a scene using WaveSpeed InfiniteTalk (image + audio) asynchronously.
    Returns task_id for polling since InfiniteTalk can take up to 10 minutes.

    The scene image and audio are loaded and the subscription pre-flight check
    is run before the background task is queued, so those failures are
    reported on this request rather than through the task.

    Raises:
        HTTPException: 401 when the user is missing or has no ``id``; 404 when
            the scene image or audio cannot be loaded.
    """
    if not current_user:
        raise HTTPException(status_code=401, detail="Authentication required")

    user_id = str(current_user.get("id", ""))
    if not user_id:
        raise HTTPException(status_code=401, detail="Invalid user ID in authentication token")

    scene_logger.info(
        "[AnimateSceneVoiceover] User=%s scene=%s resolution=%s (async)",
        user_id,
        request.scene_number,
        request.resolution or "720p",
    )

    image_bytes = load_story_image_bytes(request.image_url)
    if not image_bytes:
        raise HTTPException(status_code=404, detail="Scene image not found. Generate images first.")

    audio_bytes = load_story_audio_bytes(request.audio_url)
    if not audio_bytes:
        raise HTTPException(status_code=404, detail="Scene audio not found. Generate audio first.")

    db = next(get_db())
    try:
        pricing_service = PricingService(db)
        validate_scene_animation_operation(pricing_service=pricing_service, user_id=user_id)
    finally:
        db.close()

    # Extract token for authenticated URL building (if needed)
    bearer_prefix = "Bearer "
    auth_token = None
    auth_header = request_obj.headers.get("Authorization")
    if auth_header and auth_header.startswith(bearer_prefix):
        # Strip only the leading scheme. str.replace() would also delete any
        # later "Bearer " occurrence inside the credential itself.
        auth_token = auth_header[len(bearer_prefix):].strip()

    # Create async task
    task_id = task_manager.create_task("scene_voiceover_animation")
    background_tasks.add_task(
        _execute_voiceover_animation_task,
        task_id=task_id,
        request=request,
        user_id=user_id,
        image_bytes=image_bytes,
        audio_bytes=audio_bytes,
        auth_token=auth_token,
    )

    return {
        "task_id": task_id,
        "status": "pending",
        "message": "InfiniteTalk animation started. This may take up to 10 minutes.",
    }
def _execute_voiceover_animation_task(
    task_id: str,
    request: AnimateSceneVoiceoverRequest,
    user_id: str,
    image_bytes: bytes,
    audio_bytes: bytes,
    auth_token: Optional[str] = None,
):
    """Background task to generate InfiniteTalk video with progress updates.

    Progress is reported through ``task_manager``: "processing" at 5% and
    80%, then "completed" at 100% with the serialized ``AnimateSceneResponse``.
    Any exception is caught and recorded as a "failed" task instead of being
    propagated.

    Args:
        task_id: Identifier of the task to update.
        request: Original voiceover animation request.
        user_id: Owner of the generated video and of the usage record.
        image_bytes: Scene image content.
        audio_bytes: Scene audio content.
        auth_token: Optional token appended to the returned video URL.
    """
    try:
        task_manager.update_task_status(
            task_id, "processing", progress=5.0, message="Submitting to WaveSpeed InfiniteTalk..."
        )

        animation_result = animate_scene_with_voiceover(
            image_bytes=image_bytes,
            audio_bytes=audio_bytes,
            scene_data=request.scene_data,
            story_context=request.story_context,
            user_id=user_id,
            resolution=request.resolution or "720p",
            prompt_override=request.prompt,
            image_mime=_guess_mime_from_url(request.image_url, "image/png"),
            audio_mime=_guess_mime_from_url(request.audio_url, "audio/mpeg"),
        )

        task_manager.update_task_status(
            task_id, "processing", progress=80.0, message="Saving video file..."
        )

        base_dir = Path(__file__).parent.parent.parent
        ai_video_dir = base_dir / "story_videos" / AI_VIDEO_SUBDIR
        ai_video_dir.mkdir(parents=True, exist_ok=True)
        video_service = StoryVideoGenerationService(output_dir=str(ai_video_dir))

        save_result = video_service.save_scene_video(
            video_bytes=animation_result["video_bytes"],
            scene_number=request.scene_number,
            user_id=user_id,
        )
        video_filename = save_result["video_filename"]

        # Build authenticated URL if token provided, otherwise return plain URL
        video_url = f"/api/story/videos/ai/{video_filename}"
        if auth_token:
            video_url = f"{video_url}?token={quote(auth_token)}"

        usage_info = track_video_usage(
            user_id=user_id,
            provider=animation_result["provider"],
            model_name=animation_result["model_name"],
            prompt=animation_result["prompt"],
            video_bytes=animation_result["video_bytes"],
            cost_override=animation_result["cost"],
        )
        if usage_info:
            # Previous and current call counts are separated so the two
            # numbers do not run together in the log line.
            scene_logger.warning(
                "[AnimateSceneVoiceover] Video usage tracked user=%s: %s -> %s / %s (cost +$%.2f, total=$%.2f)",
                user_id,
                usage_info.get("previous_calls"),
                usage_info.get("current_calls"),
                usage_info.get("video_limit_display"),
                usage_info.get("cost_per_video", 0.0),
                usage_info.get("total_video_cost", 0.0),
            )

        scene_logger.info(
            "[AnimateSceneVoiceover] ✅ Completed user=%s scene=%s cost=$%.2f video=%s",
            user_id,
            request.scene_number,
            animation_result["cost"],
            video_url,
        )

        result = AnimateSceneResponse(
            success=True,
            scene_number=request.scene_number,
            video_filename=video_filename,
            video_url=video_url,
            duration=animation_result["duration"],
            cost=animation_result["cost"],
            prompt_used=animation_result["prompt"],
            provider=animation_result["provider"],
            prediction_id=animation_result.get("prediction_id"),
        )

        task_manager.update_task_status(
            task_id,
            "completed",
            progress=100.0,
            message="InfiniteTalk animation complete!",
            result=result.dict(),
        )
    except HTTPException as exc:
        # `detail` may be a plain string or a dict carrying an "error" entry;
        # anything else falls back to a generic message.
        detail = exc.detail
        if isinstance(detail, str):
            error_msg = detail
        elif isinstance(detail, dict):
            error_msg = detail.get("error", "Animation failed")
        else:
            error_msg = "Animation failed"
        scene_logger.error(f"[AnimateSceneVoiceover] Failed: {error_msg}")
        task_manager.update_task_status(
            task_id,
            "failed",
            error=error_msg,
            message=f"InfiniteTalk animation failed: {error_msg}",
        )
    except Exception as exc:
        error_msg = str(exc)
        scene_logger.error(f"[AnimateSceneVoiceover] Error: {error_msg}", exc_info=True)
        task_manager.update_task_status(
            task_id,
            "failed",
            error=error_msg,
            message=f"InfiniteTalk animation error: {error_msg}",
        )
# ---------------------------
@@ -1260,19 +1714,25 @@ def execute_complete_video_generation(
)
@router.get("/videos/{video_filename}")
async def serve_story_video(
# Regular video serving endpoint is handled by routes/video_generation.py
# Only AI videos need a separate endpoint here
@router.get("/videos/ai/{video_filename}")
async def serve_ai_story_video(
video_filename: str,
current_user: Dict[str, Any] = Depends(get_current_user)
):
"""Serve a generated story video file."""
"""Serve a generated AI scene animation video."""
try:
require_authenticated_user(current_user)
from services.story_writer.video_generation_service import StoryVideoGenerationService
from fastapi.responses import FileResponse
video_service = StoryVideoGenerationService()
base_dir = Path(__file__).parent.parent.parent
ai_video_dir = (base_dir / "story_videos" / "AI_Videos").resolve()
video_service = StoryVideoGenerationService(output_dir=str(ai_video_dir))
video_path = resolve_media_file(video_service.output_dir, video_filename)
return FileResponse(
@@ -1284,7 +1744,7 @@ async def serve_story_video(
except HTTPException:
raise
except Exception as e:
logger.error(f"[StoryWriter] Failed to serve video: {e}")
logger.error(f"[StoryWriter] Failed to serve AI video: {e}")
raise HTTPException(status_code=500, detail=str(e))

View File

@@ -0,0 +1,21 @@
"""
Collection of modular routers for Story Writer endpoints.
Each module focuses on a related set of routes to keep the primary
`router.py` concise and easier to maintain.
"""
from . import story_setup
from . import story_content
from . import story_tasks
from . import media_generation
from . import video_generation
from . import cache_routes
__all__ = [
"story_setup",
"story_content",
"story_tasks",
"media_generation",
"video_generation",
"cache_routes",
]

View File

@@ -0,0 +1,42 @@
from typing import Any, Dict
from fastapi import APIRouter, Depends, HTTPException
from loguru import logger
from middleware.auth_middleware import get_current_user
from ..cache_manager import cache_manager
from ..utils.auth import require_authenticated_user
router = APIRouter()
@router.get("/cache/stats")
async def get_cache_stats(
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> Dict[str, Any]:
    """Get cache statistics.

    Returns:
        ``{"success": True, "stats": ...}`` where ``stats`` is whatever
        ``cache_manager.get_cache_stats()`` returns.

    Raises:
        HTTPException: Re-raised unchanged when raised inside the handler
            (e.g. by the authentication check); any other failure is logged
            and reported as a 500.
    """
    try:
        require_authenticated_user(current_user)
        stats = cache_manager.get_cache_stats()
        return {"success": True, "stats": stats}
    except HTTPException:
        # Keep the original status code instead of letting the generic
        # handler below rewrite it as a 500.
        raise
    except Exception as exc:
        logger.error(f"[StoryWriter] Failed to get cache stats: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
@router.post("/cache/clear")
async def clear_cache(
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> Dict[str, Any]:
    """Clear the story generation cache.

    Returns:
        ``{"success": True}`` merged with the mapping returned by
        ``cache_manager.clear_cache()``.

    Raises:
        HTTPException: Re-raised unchanged when raised inside the handler
            (e.g. by the authentication check); any other failure is logged
            and reported as a 500.
    """
    try:
        require_authenticated_user(current_user)
        result = cache_manager.clear_cache()
        return {"success": True, **result}
    except HTTPException:
        # Keep the original status code instead of letting the generic
        # handler below rewrite it as a 500.
        raise
    except Exception as exc:
        logger.error(f"[StoryWriter] Failed to clear cache: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))

View File

@@ -0,0 +1,289 @@
from typing import Any, Dict, List
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import FileResponse
from loguru import logger
from middleware.auth_middleware import get_current_user, get_current_user_with_query_token
from models.story_models import (
StoryImageGenerationRequest,
StoryImageGenerationResponse,
StoryImageResult,
RegenerateImageRequest,
RegenerateImageResponse,
StoryAudioGenerationRequest,
StoryAudioGenerationResponse,
StoryAudioResult,
GenerateAIAudioRequest,
GenerateAIAudioResponse,
StoryScene,
)
from services.story_writer.image_generation_service import StoryImageGenerationService
from services.story_writer.audio_generation_service import StoryAudioGenerationService
from ..utils.auth import require_authenticated_user
from ..utils.media_utils import resolve_media_file
router = APIRouter()
image_service = StoryImageGenerationService()
audio_service = StoryAudioGenerationService()
@router.post("/generate-images", response_model=StoryImageGenerationResponse)
async def generate_scene_images(
    request: StoryImageGenerationRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> StoryImageGenerationResponse:
    """Create one image per story scene and return the per-scene results.

    Raises:
        HTTPException: 400 when no scenes are supplied; 500 for unexpected
            failures. HTTPExceptions raised inside are passed through.
    """
    try:
        user_id = require_authenticated_user(current_user)

        scenes = request.scenes
        if not scenes:
            raise HTTPException(status_code=400, detail="At least one scene is required")

        logger.info(f"[StoryWriter] Generating images for {len(scenes)} scenes for user {user_id}")

        # The service works on plain dicts, so model instances are converted.
        scenes_data = []
        for scene in scenes:
            scenes_data.append(scene.dict() if isinstance(scene, StoryScene) else scene)

        generated = image_service.generate_scene_images(
            scenes=scenes_data,
            user_id=user_id,
            provider=request.provider,
            width=request.width or 1024,
            height=request.height or 1024,
            model=request.model,
        )

        image_models: List[StoryImageResult] = []
        for item in generated:
            image_models.append(
                StoryImageResult(
                    scene_number=item.get("scene_number", 0),
                    scene_title=item.get("scene_title", "Untitled"),
                    image_filename=item.get("image_filename", ""),
                    image_url=item.get("image_url", ""),
                    width=item.get("width", 1024),
                    height=item.get("height", 1024),
                    provider=item.get("provider", "unknown"),
                    model=item.get("model"),
                    seed=item.get("seed"),
                    error=item.get("error"),
                )
            )

        return StoryImageGenerationResponse(images=image_models, success=True)
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"[StoryWriter] Failed to generate images: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
@router.post("/regenerate-images", response_model=RegenerateImageResponse)
async def regenerate_scene_image(
    request: RegenerateImageRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> RegenerateImageResponse:
    """Regenerate one scene image from the caller's prompt, used as-is.

    Unexpected failures are not raised: they are logged and returned as a
    response with ``success=False`` and the error text.

    Raises:
        HTTPException: 400 when the prompt is blank; HTTPExceptions raised
            inside are passed through.
    """
    # Requested dimensions with the 1024 default, shared by both outcomes.
    width = request.width or 1024
    height = request.height or 1024

    try:
        user_id = require_authenticated_user(current_user)

        if not request.prompt or not request.prompt.strip():
            raise HTTPException(status_code=400, detail="Prompt is required")

        logger.info(
            f"[StoryWriter] Regenerating image for scene {request.scene_number} "
            f"({request.scene_title}) for user {user_id}"
        )

        regenerated = image_service.regenerate_scene_image(
            scene_number=request.scene_number,
            scene_title=request.scene_title,
            prompt=request.prompt.strip(),
            user_id=user_id,
            provider=request.provider,
            width=width,
            height=height,
            model=request.model,
        )

        response_fields = {
            "scene_number": regenerated.get("scene_number", request.scene_number),
            "scene_title": regenerated.get("scene_title", request.scene_title),
            "image_filename": regenerated.get("image_filename", ""),
            "image_url": regenerated.get("image_url", ""),
            "width": regenerated.get("width", width),
            "height": regenerated.get("height", height),
            "provider": regenerated.get("provider", "unknown"),
            "model": regenerated.get("model"),
            "seed": regenerated.get("seed"),
            "success": True,
        }
        return RegenerateImageResponse(**response_fields)
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"[StoryWriter] Failed to regenerate image: {exc}")
        return RegenerateImageResponse(
            scene_number=request.scene_number,
            scene_title=request.scene_title,
            image_filename="",
            image_url="",
            width=width,
            height=height,
            provider=request.provider or "unknown",
            success=False,
            error=str(exc),
        )
@router.get("/images/{image_filename}")
async def serve_scene_image(
    image_filename: str,
    current_user: Dict[str, Any] = Depends(get_current_user_with_query_token),
):
    """Return a generated scene image as a PNG file response.

    Authentication is accepted from the Authorization header or from a
    ``token`` query parameter; the latter lets HTML elements such as <img>,
    which cannot send custom headers, load the image.
    """
    try:
        require_authenticated_user(current_user)
        resolved_path = resolve_media_file(image_service.output_dir, image_filename)
        response = FileResponse(
            path=str(resolved_path),
            media_type="image/png",
            filename=image_filename,
        )
        return response
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"[StoryWriter] Failed to serve image: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
@router.post("/generate-audio", response_model=StoryAudioGenerationResponse)
async def generate_scene_audio(
    request: StoryAudioGenerationRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> StoryAudioGenerationResponse:
    """Produce narration audio for each story scene and return the results.

    Raises:
        HTTPException: 400 when no scenes are supplied; 500 for unexpected
            failures. HTTPExceptions raised inside are passed through.
    """
    try:
        user_id = require_authenticated_user(current_user)

        scenes = request.scenes
        if not scenes:
            raise HTTPException(status_code=400, detail="At least one scene is required")

        logger.info(f"[StoryWriter] Generating audio for {len(scenes)} scenes for user {user_id}")

        # The service works on plain dicts, so model instances are converted.
        scenes_data = []
        for scene in scenes:
            scenes_data.append(scene.dict() if isinstance(scene, StoryScene) else scene)

        generated = audio_service.generate_scene_audio_list(
            scenes=scenes_data,
            user_id=user_id,
            provider=request.provider or "gtts",
            lang=request.lang or "en",
            slow=request.slow or False,
            rate=request.rate or 150,
        )

        # Filename and URL may come back as None; they are normalised to "".
        audio_models: List[StoryAudioResult] = [
            StoryAudioResult(
                scene_number=item.get("scene_number", 0),
                scene_title=item.get("scene_title", "Untitled"),
                audio_filename=item.get("audio_filename") or "",
                audio_url=item.get("audio_url") or "",
                provider=item.get("provider", "unknown"),
                file_size=item.get("file_size", 0),
                error=item.get("error"),
            )
            for item in generated
        ]

        return StoryAudioGenerationResponse(audio_files=audio_models, success=True)
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"[StoryWriter] Failed to generate audio: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
@router.post("/generate-ai-audio", response_model=GenerateAIAudioResponse)
async def generate_ai_audio(
    request: GenerateAIAudioRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> GenerateAIAudioResponse:
    """Synthesize AI narration for one scene via WaveSpeed Minimax Speech 02 HD.

    Unexpected failures are not raised: they are logged and returned as a
    response with ``success=False`` and the error text.

    Raises:
        HTTPException: 400 when the text is blank; HTTPExceptions raised
            inside are passed through.
    """
    # Voice with its default, shared by the success and failure responses.
    voice_id = request.voice_id or "Wise_Woman"

    try:
        user_id = require_authenticated_user(current_user)

        if not request.text or not request.text.strip():
            raise HTTPException(status_code=400, detail="Text is required")

        logger.info(
            f"[StoryWriter] Generating AI audio for scene {request.scene_number} "
            f"({request.scene_title}) for user {user_id}"
        )

        generated = audio_service.generate_ai_audio(
            scene_number=request.scene_number,
            scene_title=request.scene_title,
            text=request.text.strip(),
            user_id=user_id,
            voice_id=voice_id,
            speed=request.speed or 1.0,
            volume=request.volume or 1.0,
            pitch=request.pitch or 0.0,
            emotion=request.emotion or "happy",
        )

        response_fields = {
            "scene_number": generated.get("scene_number", request.scene_number),
            "scene_title": generated.get("scene_title", request.scene_title),
            "audio_filename": generated.get("audio_filename", ""),
            "audio_url": generated.get("audio_url", ""),
            "provider": generated.get("provider", "wavespeed"),
            "model": generated.get("model", "minimax/speech-02-hd"),
            "voice_id": generated.get("voice_id", voice_id),
            "text_length": generated.get("text_length", len(request.text)),
            "file_size": generated.get("file_size", 0),
            "cost": generated.get("cost", 0.0),
            "success": True,
        }
        return GenerateAIAudioResponse(**response_fields)
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"[StoryWriter] Failed to generate AI audio: {exc}")
        text_length = len(request.text) if request.text else 0
        return GenerateAIAudioResponse(
            scene_number=request.scene_number,
            scene_title=request.scene_title,
            audio_filename="",
            audio_url="",
            provider="wavespeed",
            model="minimax/speech-02-hd",
            voice_id=voice_id,
            text_length=text_length,
            file_size=0,
            cost=0.0,
            success=False,
            error=str(exc),
        )
@router.get("/audio/{audio_filename}")
async def serve_scene_audio(
    audio_filename: str,
    current_user: Dict[str, Any] = Depends(get_current_user),
):
    """Return a generated scene audio file as an ``audio/mpeg`` response.

    Raises:
        HTTPException: Passed through when raised inside the handler; any
            other failure is logged and reported as a 500.
    """
    try:
        require_authenticated_user(current_user)
        resolved_path = resolve_media_file(audio_service.output_dir, audio_filename)
        response = FileResponse(
            path=str(resolved_path),
            media_type="audio/mpeg",
            filename=audio_filename,
        )
        return response
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"[StoryWriter] Failed to serve audio: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))

View File

@@ -0,0 +1,195 @@
from typing import Any, Dict, List
from fastapi import APIRouter, Depends, HTTPException
from loguru import logger
from middleware.auth_middleware import get_current_user
from models.story_models import (
StoryStartRequest,
StoryContentResponse,
StoryScene,
StoryContinueRequest,
StoryContinueResponse,
)
from services.story_writer.story_service import StoryWriterService
from ..utils.auth import require_authenticated_user
router = APIRouter()
story_service = StoryWriterService()
@router.post("/generate-start", response_model=StoryContentResponse)
async def generate_story_start(
    request: StoryStartRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> StoryContentResponse:
    """Generate the starting section of a story.

    Validates the premise and outline, delegates the writing to
    ``story_service.generate_story_start`` and, for short stories only, marks
    the story complete when the generated text has at least 900 words.

    Returns:
        StoryContentResponse with the generated text, the premise, the outline
        rendered as a string, and the ``is_complete`` flag.

    Raises:
        HTTPException: 400 when the premise or outline is missing/blank;
            500 for any unexpected failure (detail is the exception text).
    """
    try:
        user_id = require_authenticated_user(current_user)
        if not request.premise or not request.premise.strip():
            raise HTTPException(status_code=400, detail="Premise is required")
        if not request.outline or (isinstance(request.outline, str) and not request.outline.strip()):
            raise HTTPException(status_code=400, detail="Outline is required")
        logger.info(f"[StoryWriter] Generating story start for user {user_id}")

        # A structured outline arrives as StoryScene models; the service and the
        # formatting below work on plain dicts.
        outline_data: Any = request.outline
        if isinstance(outline_data, list) and outline_data and isinstance(outline_data[0], StoryScene):
            outline_data = [scene.dict() for scene in outline_data]

        # Normalize before the (expensive) generation call so a null/empty
        # story_length cannot blow up on .lower() after the story was generated.
        story_length = getattr(request, "story_length", None) or "Medium"
        story_length_lower = story_length.lower()
        # "1000" is a substring of "10000": drop the long-story token before
        # testing, otherwise long stories are misclassified as short.
        is_short_story = (
            "short" in story_length_lower
            or "1000" in story_length_lower.replace("10000", "")
        )

        story_start = story_service.generate_story_start(
            premise=request.premise,
            outline=outline_data,
            persona=request.persona,
            story_setting=request.story_setting,
            character_input=request.character_input,
            plot_elements=request.plot_elements,
            writing_style=request.writing_style,
            story_tone=request.story_tone,
            narrative_pov=request.narrative_pov,
            audience_age_group=request.audience_age_group,
            content_rating=request.content_rating,
            ending_preference=request.ending_preference,
            story_length=story_length,
            user_id=user_id,
        )

        is_complete = False
        if is_short_story:
            word_count = len(story_start.split()) if story_start else 0
            if word_count >= 900:
                is_complete = True
                logger.info(
                    f"[StoryWriter] Short story generated with {word_count} words. Marking as complete."
                )
            else:
                logger.warning(
                    f"[StoryWriter] Short story generated with only {word_count} words. May need continuation."
                )

        # The response carries the outline as text: flatten scene dicts into
        # "Scene N: title\n  description" lines.
        outline_response = outline_data
        if isinstance(outline_response, list):
            outline_response = "\n".join(
                f"Scene {scene.get('scene_number', i + 1)}: "
                f"{scene.get('title', 'Untitled')}\n  {scene.get('description', '')}"
                for i, scene in enumerate(outline_response)
            )

        return StoryContentResponse(
            story=story_start,
            premise=request.premise,
            outline=str(outline_response),
            is_complete=is_complete,
            success=True,
        )
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"[StoryWriter] Failed to generate story start: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
@router.post("/continue", response_model=StoryContinueResponse)
async def continue_story(
    request: StoryContinueRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> StoryContinueResponse:
    """Continue writing a story, stopping once the target word count is reached.

    Targets are 10000 words for long stories and 4500 otherwise, with a 5%
    buffer. The literal marker ``IAMDONE`` in the continuation signals
    completion; it is appended here when the word count says the story is done
    but the generated text did not include it.

    Raises:
        HTTPException: 400 when the story text is blank or the story is a
            short story (those are generated in a single call); 500 for any
            unexpected failure (detail is the exception text).
    """
    try:
        user_id = require_authenticated_user(current_user)
        if not request.story_text or not request.story_text.strip():
            raise HTTPException(status_code=400, detail="Story text is required")
        logger.info(f"[StoryWriter] Continuing story for user {user_id}")

        outline_data: Any = request.outline
        if isinstance(outline_data, list) and outline_data and isinstance(outline_data[0], StoryScene):
            outline_data = [scene.dict() for scene in outline_data]

        # Tolerate a null/empty story_length instead of failing on .lower().
        story_length = getattr(request, "story_length", None) or "Medium"
        story_length_lower = story_length.lower()
        is_long_story = "long" in story_length_lower or "10000" in story_length_lower
        # "1000" is a substring of "10000": drop the long-story token before
        # testing, otherwise long stories are rejected as short ones.
        is_short_story = (
            "short" in story_length_lower
            or "1000" in story_length_lower.replace("10000", "")
        )
        if is_short_story:
            logger.warning(
                "[StoryWriter] Attempted to continue a short story. Short stories should be complete in one call."
            )
            raise HTTPException(
                status_code=400,
                detail="Short stories are generated in a single call and should be complete. "
                "If the story is incomplete, please regenerate it from the beginning.",
            )

        # story_text was validated as non-blank above.
        current_word_count = len(request.story_text.split())
        target_total_words = 10000 if is_long_story else 4500
        buffer_target = int(target_total_words * 1.05)

        # Already long enough: answer without spending a generation call.
        if current_word_count >= buffer_target or (
            current_word_count >= target_total_words
            and (current_word_count - target_total_words) < 50
        ):
            logger.info(
                f"[StoryWriter] Word count ({current_word_count}) already at or near target ({target_total_words})."
            )
            return StoryContinueResponse(continuation="IAMDONE", is_complete=True, success=True)

        continuation = story_service.continue_story(
            premise=request.premise,
            outline=outline_data,
            story_text=request.story_text,
            persona=request.persona,
            story_setting=request.story_setting,
            character_input=request.character_input,
            plot_elements=request.plot_elements,
            writing_style=request.writing_style,
            story_tone=request.story_tone,
            narrative_pov=request.narrative_pov,
            audience_age_group=request.audience_age_group,
            content_rating=request.content_rating,
            ending_preference=request.ending_preference,
            story_length=story_length,
            user_id=user_id,
        )

        is_complete = "IAMDONE" in continuation.upper()
        if not is_complete and continuation:
            new_story_text = request.story_text + "\n\n" + continuation
            new_word_count = len(new_story_text.split())
            reached_target = False
            if new_word_count >= buffer_target:
                logger.info(
                    f"[StoryWriter] Word count ({new_word_count}) now exceeds buffer target ({buffer_target})."
                )
                reached_target = True
            elif new_word_count >= target_total_words and (
                new_word_count - target_total_words
            ) < 100:
                logger.info(
                    f"[StoryWriter] Word count ({new_word_count}) is at or very close to target ({target_total_words})."
                )
                reached_target = True
            if reached_target:
                # The marker is known to be absent in this branch, so append it
                # unconditionally to tell the client to stop.
                continuation = continuation.rstrip() + "\n\nIAMDONE"
                is_complete = True

        return StoryContinueResponse(continuation=continuation, is_complete=is_complete, success=True)
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"[StoryWriter] Failed to continue story: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))

View File

@@ -0,0 +1,141 @@
from typing import Any, Dict, List
from fastapi import APIRouter, Depends, HTTPException
from loguru import logger
from middleware.auth_middleware import get_current_user
from models.story_models import (
StorySetupGenerationRequest,
StorySetupGenerationResponse,
StorySetupOption,
StoryGenerationRequest,
StoryOutlineResponse,
StoryScene,
StoryStartRequest,
StoryPremiseResponse,
)
from services.story_writer.story_service import StoryWriterService
from ..utils.auth import require_authenticated_user
router = APIRouter()
story_service = StoryWriterService()
@router.post("/generate-setup", response_model=StorySetupGenerationResponse)
async def generate_story_setup(
    request: StorySetupGenerationRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> StorySetupGenerationResponse:
    """Turn a user's story idea into story setup options.

    A blank idea is rejected with HTTP 400. Each option returned by the story
    service is wrapped in a ``StorySetupOption``. Unexpected failures are
    logged and reported as HTTP 500.
    """
    try:
        user_id = require_authenticated_user(current_user)

        idea = request.story_idea
        if not (idea and idea.strip()):
            raise HTTPException(status_code=400, detail="Story idea is required")

        logger.info(f"[StoryWriter] Generating story setup options for user {user_id}")
        raw_options = story_service.generate_story_setup_options(
            story_idea=idea,
            user_id=user_id,
        )

        setup_options = []
        for raw_option in raw_options:
            setup_options.append(StorySetupOption(**raw_option))

        return StorySetupGenerationResponse(options=setup_options, success=True)
    except HTTPException:
        raise
    except Exception as setup_error:
        logger.error(f"[StoryWriter] Failed to generate story setup options: {setup_error}")
        raise HTTPException(status_code=500, detail=str(setup_error))
@router.post("/generate-premise", response_model=StoryPremiseResponse)
async def generate_premise(
    request: StoryGenerationRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> StoryPremiseResponse:
    """Produce a story premise from the story parameters in the request.

    The request's story parameters are forwarded unchanged to
    ``story_service.generate_premise`` together with the authenticated user id.
    Unexpected failures are logged and reported as HTTP 500.
    """
    # Request attributes passed straight through as same-named keyword arguments.
    forwarded_fields = (
        "persona",
        "story_setting",
        "character_input",
        "plot_elements",
        "writing_style",
        "story_tone",
        "narrative_pov",
        "audience_age_group",
        "content_rating",
        "ending_preference",
    )
    try:
        user_id = require_authenticated_user(current_user)
        logger.info(f"[StoryWriter] Generating premise for user {user_id}")

        story_params = {name: getattr(request, name) for name in forwarded_fields}
        premise = story_service.generate_premise(**story_params, user_id=user_id)

        return StoryPremiseResponse(premise=premise, success=True)
    except HTTPException:
        raise
    except Exception as premise_error:
        logger.error(f"[StoryWriter] Failed to generate premise: {premise_error}")
        raise HTTPException(status_code=500, detail=str(premise_error))
@router.post("/generate-outline", response_model=StoryOutlineResponse)
async def generate_outline(
    request: StoryStartRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
    use_structured: bool = True,
) -> StoryOutlineResponse:
    """Generate a story outline from a premise.

    Args:
        request: Story parameters; ``premise`` must be non-blank.
        current_user: Authenticated user injected by the auth dependency.
        use_structured: Forwarded to the service as ``use_structured_output``.

    Returns:
        StoryOutlineResponse with ``is_structured=True`` and a list of
        ``StoryScene`` when the service returns a list, otherwise the outline
        as a string with ``is_structured=False``.

    Raises:
        HTTPException: 400 when the premise is missing/blank; 500 for any
            unexpected failure (detail is the exception text).
    """
    try:
        user_id = require_authenticated_user(current_user)
        if not request.premise or not request.premise.strip():
            raise HTTPException(status_code=400, detail="Premise is required")

        logger.info(
            f"[StoryWriter] Generating outline for user {user_id} (structured={use_structured})"
        )
        # loguru formats positional arguments with str.format ("{}"), not with
        # %-style placeholders; "%s" would be logged literally and the values dropped.
        logger.info(
            "[StoryWriter] Outline params: audience_age_group={}, writing_style={}, story_tone={}",
            request.audience_age_group,
            request.writing_style,
            request.story_tone,
        )

        outline = story_service.generate_outline(
            premise=request.premise,
            persona=request.persona,
            story_setting=request.story_setting,
            character_input=request.character_input,
            plot_elements=request.plot_elements,
            writing_style=request.writing_style,
            story_tone=request.story_tone,
            narrative_pov=request.narrative_pov,
            audience_age_group=request.audience_age_group,
            content_rating=request.content_rating,
            ending_preference=request.ending_preference,
            user_id=user_id,
            use_structured_output=use_structured,
        )

        if isinstance(outline, list):
            # Dict scenes are promoted to models; anything else is passed through as-is.
            scenes: List[StoryScene] = [
                StoryScene(**scene) if isinstance(scene, dict) else scene for scene in outline
            ]
            return StoryOutlineResponse(outline=scenes, success=True, is_structured=True)
        return StoryOutlineResponse(outline=str(outline), success=True, is_structured=False)
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"[StoryWriter] Failed to generate outline: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))

View File

@@ -0,0 +1,130 @@
from typing import Any, Dict
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from loguru import logger
from middleware.auth_middleware import get_current_user
from models.story_models import (
StoryGenerationRequest,
TaskStatus,
)
from services.story_writer.story_service import StoryWriterService
from ..cache_manager import cache_manager
from ..task_manager import task_manager
from ..utils.auth import require_authenticated_user
router = APIRouter()
story_service = StoryWriterService()
@router.post("/generate-full", response_model=Dict[str, Any])
async def generate_full_story(
    request: StoryGenerationRequest,
    background_tasks: BackgroundTasks,
    current_user: Dict[str, Any] = Depends(get_current_user),
    max_iterations: int = 10,
) -> Dict[str, Any]:
    """Start full-story generation as a background task and return its task id.

    When the cache already holds a result for this request payload, a task is
    still created but is marked completed immediately with the cached result,
    and the response carries ``"cached": True``. Otherwise the work is
    scheduled on ``background_tasks`` and the response reports ``"pending"``.
    Unexpected failures are logged and reported as HTTP 500.
    """
    try:
        user_id = require_authenticated_user(current_user)

        lookup_key = cache_manager.get_cache_key(request.dict())
        cached_result = cache_manager.get_cached_result(lookup_key)

        if not cached_result:
            task_id = task_manager.create_task("story_generation")

            # The iteration cap travels with the request payload into the task.
            request_data = request.dict()
            request_data["max_iterations"] = max_iterations

            background_tasks.add_task(
                task_manager.execute_story_generation_task,
                task_id=task_id,
                request_data=request_data,
                user_id=user_id,
            )
            logger.info(f"[StoryWriter] Created task {task_id} for full story generation (user {user_id})")

            # NOTE(review): the message is a plain string, so "{task_id}" is sent
            # literally (reads as a route template) - confirm this is intended.
            pending_response = {
                "task_id": task_id,
                "status": "pending",
                "message": "Story generation started. Use /task/{task_id}/status to check progress.",
            }
            return pending_response

        # Cache hit: expose the stored result through the regular task API.
        logger.info(f"[StoryWriter] Returning cached result for user {user_id}")
        task_id = task_manager.create_task("story_generation")
        task_manager.update_task_status(
            task_id,
            "completed",
            progress=100.0,
            result=cached_result,
            message="Returned cached result",
        )
        return {"task_id": task_id, "cached": True}
    except HTTPException:
        raise
    except Exception as start_error:
        logger.error(f"[StoryWriter] Failed to start story generation: {start_error}")
        raise HTTPException(status_code=500, detail=str(start_error))
@router.get("/task/{task_id}/status", response_model=TaskStatus)
async def get_task_status(
    task_id: str,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> TaskStatus:
    """Look up a task in the task manager and return its current status.

    Responds with HTTP 404 when the task manager has no (or an empty) entry
    for ``task_id``; unexpected failures are logged and reported as HTTP 500.
    """
    try:
        require_authenticated_user(current_user)

        snapshot = task_manager.get_task_status(task_id)
        if snapshot:
            return TaskStatus(**snapshot)

        raise HTTPException(status_code=404, detail=f"Task {task_id} not found")
    except HTTPException:
        raise
    except Exception as status_error:
        logger.error(f"[StoryWriter] Failed to get task status: {status_error}")
        raise HTTPException(status_code=500, detail=str(status_error))
@router.get("/task/{task_id}/result")
async def get_task_result(
    task_id: str,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> Dict[str, Any]:
    """Return the stored result of a completed task.

    Responds with 404 when the task or its result is missing and with 400
    while the task status is anything other than "completed". A dict result is
    returned as a copy with ``task_id`` set and ``success`` defaulted to True;
    any other result is wrapped under the ``"result"`` key.
    """
    try:
        require_authenticated_user(current_user)

        snapshot = task_manager.get_task_status(task_id)
        if not snapshot:
            raise HTTPException(status_code=404, detail=f"Task {task_id} not found")

        current_state = snapshot["status"]
        if current_state != "completed":
            raise HTTPException(
                status_code=400,
                detail=f"Task {task_id} is not completed. Status: {current_state}",
            )

        stored = snapshot.get("result")
        if not stored:
            raise HTTPException(status_code=404, detail=f"No result found for task {task_id}")

        if not isinstance(stored, dict):
            return {"result": stored, "success": True, "task_id": task_id}

        # Copy so the task manager's stored result is not mutated.
        response_body = dict(stored)
        if "success" not in response_body:
            response_body["success"] = True
        response_body["task_id"] = task_id
        return response_body
    except HTTPException:
        raise
    except Exception as result_error:
        logger.error(f"[StoryWriter] Failed to get task result: {result_error}")
        raise HTTPException(status_code=500, detail=str(result_error))

View File

@@ -0,0 +1,511 @@
from pathlib import Path
from typing import Any, Dict, List, Optional
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from fastapi.responses import FileResponse
from loguru import logger
from pydantic import BaseModel
from middleware.auth_middleware import get_current_user, get_current_user_with_query_token
from models.story_models import (
StoryVideoGenerationRequest,
StoryVideoGenerationResponse,
StoryVideoResult,
StoryScene,
StoryGenerationRequest,
)
from services.story_writer.video_generation_service import StoryVideoGenerationService
from services.story_writer.image_generation_service import StoryImageGenerationService
from services.story_writer.audio_generation_service import StoryAudioGenerationService
from services.story_writer.story_service import StoryWriterService
from ..task_manager import task_manager
from ..utils.auth import require_authenticated_user
from ..utils.hd_video import (
generate_hd_video_payload,
generate_hd_video_scene_payload,
)
from ..utils.media_utils import resolve_media_file
router = APIRouter()
video_service = StoryVideoGenerationService()
image_service = StoryImageGenerationService()
audio_service = StoryAudioGenerationService()
story_service = StoryWriterService()
class HDVideoRequest(BaseModel):
    """Request body for the ``/hd-video`` endpoint.

    Only ``prompt`` is required. Every tuning field defaults to None.
    """

    prompt: str
    provider: str = "huggingface"
    # Optional generation overrides; None presumably defers to the downstream
    # generator's own defaults - confirm against generate_hd_video_payload.
    model: Optional[str] = None
    num_frames: Optional[int] = None
    guidance_scale: Optional[float] = None
    num_inference_steps: Optional[int] = None
    negative_prompt: Optional[str] = None
    seed: Optional[int] = None
class HDVideoSceneRequest(BaseModel):
    """Request body for the ``/hd-video-scene`` endpoint (one scene at a time).

    The scene and story fields are required. The provider defaults to
    "huggingface" and every tuning field defaults to None.
    """

    scene_number: int
    scene_data: Dict[str, Any]
    story_context: Dict[str, Any]
    all_scenes: List[Dict[str, Any]]
    provider: str = "huggingface"
    # Optional generation overrides; None presumably defers to the downstream
    # generator's own defaults - confirm against generate_hd_video_scene_payload.
    model: Optional[str] = None
    num_frames: Optional[int] = None
    guidance_scale: Optional[float] = None
    num_inference_steps: Optional[int] = None
    negative_prompt: Optional[str] = None
    seed: Optional[int] = None
@router.post("/generate-video", response_model=StoryVideoGenerationResponse)
async def generate_story_video(
    request: StoryVideoGenerationRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> StoryVideoGenerationResponse:
    """Generate a video from story scenes, images, and audio.

    For each scene the visual asset is the animated video from
    ``request.video_urls`` when that file exists, otherwise the static image.
    The audio is the AI audio from ``request.ai_audio_urls`` when that file
    exists, otherwise the regular audio. A scene is used only when both a
    visual and an audio file were found; scenes missing either are skipped.

    Raises:
        HTTPException: 400 when no scenes are given, the URL lists do not
            match the scene count, or no scene has usable assets; 500 for any
            unexpected failure (detail is the exception text).
    """
    try:
        user_id = require_authenticated_user(current_user)
        if not request.scenes:
            raise HTTPException(status_code=400, detail="At least one scene is required")
        if len(request.scenes) != len(request.image_urls) or len(request.scenes) != len(request.audio_urls):
            raise HTTPException(
                status_code=400,
                detail="Number of scenes, image URLs, and audio URLs must match",
            )
        logger.info(f"[StoryWriter] Generating video for {len(request.scenes)} scenes for user {user_id}")
        scenes_data = [scene.dict() if isinstance(scene, StoryScene) else scene for scene in request.scenes]

        # The four lists below are parallel: index i describes the same scene in
        # each. Entries are appended together, and only once a scene has both a
        # visual and an audio asset, so they can never drift out of alignment.
        video_paths: List[Optional[str]] = []  # Animated videos (preferred)
        image_paths: List[Optional[str]] = []  # Static images (fallback)
        audio_paths: List[str] = []
        valid_scenes: List[Dict[str, Any]] = []

        # Resolve video/audio directories
        base_dir = Path(__file__).parent.parent.parent.parent
        ai_video_dir = (base_dir / "story_videos" / "AI_Videos").resolve()
        video_urls = request.video_urls or [None] * len(request.scenes)
        ai_audio_urls = request.ai_audio_urls or [None] * len(request.scenes)

        for idx, (scene, image_url, audio_url) in enumerate(zip(scenes_data, request.image_urls, request.audio_urls)):
            # --- Visual asset: prefer the animated video, fall back to the image.
            video_url = video_urls[idx] if idx < len(video_urls) else None
            scene_video: Optional[str] = None
            scene_image: Optional[str] = None
            if video_url:
                # Keep only the last URL segment, without any query string
                # (e.g., /api/story/videos/ai/filename.mp4 -> filename.mp4).
                video_filename = video_url.split("/")[-1].split("?")[0]
                video_path = ai_video_dir / video_filename
                if video_path.exists():
                    logger.info(f"[StoryWriter] Using animated video for scene {scene.get('scene_number', idx+1)}: {video_filename}")
                    scene_video = str(video_path)
                else:
                    logger.warning(f"[StoryWriter] Animated video not found: {video_path}, falling back to image")
            if scene_video is None:
                image_filename = image_url.split("/")[-1].split("?")[0]
                image_path = image_service.output_dir / image_filename
                if not image_path.exists():
                    logger.warning(f"[StoryWriter] Image not found: {image_path} (from URL: {image_url})")
                    continue
                scene_image = str(image_path)

            # --- Audio asset: prefer AI audio, fall back to the free audio.
            ai_audio_url = ai_audio_urls[idx] if idx < len(ai_audio_urls) else None
            audio_path = None
            if ai_audio_url:
                audio_filename = ai_audio_url.split("/")[-1].split("?")[0]
                audio_path = audio_service.output_dir / audio_filename
                if audio_path.exists():
                    logger.info(f"[StoryWriter] Using AI audio for scene {scene.get('scene_number', idx+1)}: {audio_filename}")
                else:
                    logger.warning(f"[StoryWriter] AI audio not found: {audio_path}, falling back to free audio")
                    audio_path = None
            if audio_path is None:
                audio_filename = audio_url.split("/")[-1].split("?")[0]
                audio_path = audio_service.output_dir / audio_filename
                if not audio_path.exists():
                    logger.warning(f"[StoryWriter] Audio not found: {audio_path} (from URL: {audio_url})")
                    continue

            # Both assets exist: record the scene in all four lists at once.
            video_paths.append(scene_video)
            image_paths.append(scene_image)
            audio_paths.append(str(audio_path))
            valid_scenes.append(scene)

        if not valid_scenes:
            raise HTTPException(status_code=400, detail="No valid video/image or audio files were found")

        video_result = video_service.generate_story_video(
            scenes=valid_scenes,
            image_paths=image_paths,  # None for scenes that use an animated video
            video_paths=video_paths,  # None for scenes that use a static image
            audio_paths=audio_paths,
            user_id=user_id,
            story_title=request.story_title or "Story",
            fps=request.fps or 24,
            transition_duration=request.transition_duration or 0.5,
        )
        video_model = StoryVideoResult(
            video_filename=video_result.get("video_filename", ""),
            video_url=video_result.get("video_url", ""),
            duration=video_result.get("duration", 0.0),
            fps=video_result.get("fps", 24),
            file_size=video_result.get("file_size", 0),
            num_scenes=video_result.get("num_scenes", 0),
            error=video_result.get("error"),
        )
        return StoryVideoGenerationResponse(video=video_model, success=True)
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"[StoryWriter] Failed to generate video: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
@router.post("/generate-video-async", response_model=Dict[str, Any])
async def generate_story_video_async(
    request: StoryVideoGenerationRequest,
    background_tasks: BackgroundTasks,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> Dict[str, Any]:
    """
    Schedule story video generation in the background and return the task id.

    Progress is reported through the task manager, so the frontend can poll
    /api/story/task/{task_id}/status to show progress messages. The request is
    validated up front: at least one scene, and as many image and audio URLs
    as scenes (HTTP 400 otherwise).
    """
    try:
        user_id = require_authenticated_user(current_user)

        scene_total = len(request.scenes) if request.scenes else 0
        if scene_total == 0:
            raise HTTPException(status_code=400, detail="At least one scene is required")
        if not (scene_total == len(request.image_urls) == len(request.audio_urls)):
            raise HTTPException(
                status_code=400,
                detail="Number of scenes, image URLs, and audio URLs must match",
            )

        new_task_id = task_manager.create_task("story_video_generation")
        background_tasks.add_task(
            _execute_video_generation_task,
            task_id=new_task_id,
            request=request,
            user_id=user_id,
        )

        accepted = {
            "task_id": new_task_id,
            "status": "pending",
            "message": "Video generation started",
        }
        return accepted
    except HTTPException:
        raise
    except Exception as schedule_error:
        logger.error(f"[StoryWriter] Failed to start async video generation: {schedule_error}")
        raise HTTPException(status_code=500, detail=str(schedule_error))
def _execute_video_generation_task(task_id: str, request: StoryVideoGenerationRequest, user_id: str):
    """Background task to generate story video with progress mapped to task manager.

    Scenes whose image or audio file cannot be found on disk are skipped. The
    video service's sub-progress is clamped to 0-100 and mapped onto 5-95% of
    the task. This function never raises: any failure is logged with its
    traceback and recorded on the task as status "failed".

    NOTE(review): unlike the synchronous /generate-video endpoint, this path
    only uses image_urls/audio_urls and ignores video_urls/ai_audio_urls -
    confirm whether that is intended.
    """
    try:
        task_manager.update_task_status(task_id, "processing", progress=2.0, message="Initializing video generation...")
        scenes_data = [scene.dict() if isinstance(scene, StoryScene) else scene for scene in request.scenes]

        # Parallel lists: only scenes with both assets present are kept.
        image_paths: List[str] = []
        audio_paths: List[str] = []
        valid_scenes: List[Dict[str, Any]] = []
        for scene, image_url, audio_url in zip(scenes_data, request.image_urls, request.audio_urls):
            # Keep only the last URL segment, without any query string.
            image_filename = image_url.split("/")[-1].split("?")[0]
            audio_filename = audio_url.split("/")[-1].split("?")[0]
            image_path = image_service.output_dir / image_filename
            audio_path = audio_service.output_dir / audio_filename
            if not image_path.exists():
                logger.warning(f"[StoryWriter] Image not found: {image_path} (from URL: {image_url})")
                continue
            if not audio_path.exists():
                logger.warning(f"[StoryWriter] Audio not found: {audio_path} (from URL: {audio_url})")
                continue
            image_paths.append(str(image_path))
            audio_paths.append(str(audio_path))
            valid_scenes.append(scene)

        if not image_paths or not audio_paths or len(image_paths) != len(audio_paths):
            raise RuntimeError("No valid or mismatched image/audio assets for video generation.")

        def progress_callback(sub_progress: float, msg: str):
            # Clamp to 0-100, then scale into the 5-95% band of the overall task.
            overall = 5.0 + max(0.0, min(100.0, sub_progress)) * 0.9
            task_manager.update_task_status(task_id, "processing", progress=overall, message=msg)

        result = video_service.generate_story_video(
            scenes=valid_scenes,
            image_paths=image_paths,
            audio_paths=audio_paths,
            user_id=user_id,
            story_title=request.story_title or "Story",
            fps=request.fps or 24,
            transition_duration=request.transition_duration or 0.5,
            progress_callback=progress_callback,
        )
        task_manager.update_task_status(
            task_id,
            "completed",
            progress=100.0,
            message="Video generation complete!",
            result={"video": result, "success": True},
        )
    except Exception as exc:
        # loguru has no exc_info flag: extra kwargs make it run
        # message.format(**kwargs), which can itself raise when the exception
        # text contains braces, so the task would never be marked failed.
        # logger.exception() records the traceback without formatting the message.
        logger.exception(f"[StoryWriter] Async video generation failed: {exc}")
        task_manager.update_task_status(task_id, "failed", error=str(exc), message=f"Video generation failed: {exc}")
@router.post("/hd-video")
async def generate_hd_video(
    request: HDVideoRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> Dict[str, Any]:
    """Generate an HD video from a prompt by delegating to ``generate_hd_video_payload``.

    Raises:
        HTTPException: passed through unchanged when raised by authentication
            or the payload helper; 500 for any other failure (detail is the
            exception text).
    """
    try:
        user_id = require_authenticated_user(current_user)
        return generate_hd_video_payload(request, user_id)
    except HTTPException:
        raise
    except Exception as exc:
        # loguru has no exc_info flag: extra kwargs trigger message.format(),
        # which can raise when the exception text contains braces.
        # logger.exception() logs the traceback without formatting the message.
        logger.exception(f"[StoryWriter] Failed to generate HD video: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
@router.post("/hd-video-scene")
async def generate_hd_video_scene(
    request: HDVideoSceneRequest,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> Dict[str, Any]:
    """Generate an HD video for one scene by delegating to ``generate_hd_video_scene_payload``.

    Raises:
        HTTPException: passed through unchanged when raised by authentication
            or the payload helper; 500 for any other failure (detail is the
            exception text).
    """
    try:
        user_id = require_authenticated_user(current_user)
        return generate_hd_video_scene_payload(request, user_id)
    except HTTPException:
        raise
    except Exception as exc:
        # loguru has no exc_info flag: extra kwargs trigger message.format(),
        # which can raise when the exception text contains braces.
        # logger.exception() logs the traceback without formatting the message.
        logger.exception(f"[StoryWriter] Failed to generate HD video for scene: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))
@router.post("/generate-complete-video", response_model=Dict[str, Any])
async def generate_complete_story_video(
    request: StoryGenerationRequest,
    background_tasks: BackgroundTasks,
    current_user: Dict[str, Any] = Depends(get_current_user),
) -> Dict[str, Any]:
    """Kick off the complete story-to-video workflow as a background task.

    Creates a "complete_video_generation" task, schedules
    ``execute_complete_video_generation`` with the request payload, and
    returns the task id with status "pending". Unexpected failures are logged
    and reported as HTTP 500.
    """
    try:
        user_id = require_authenticated_user(current_user)
        logger.info(f"[StoryWriter] Starting complete video generation for user {user_id}")

        workflow_task_id = task_manager.create_task("complete_video_generation")
        background_tasks.add_task(
            execute_complete_video_generation,
            task_id=workflow_task_id,
            request_data=request.dict(),
            user_id=user_id,
        )

        accepted = {
            "task_id": workflow_task_id,
            "status": "pending",
            "message": "Complete video generation started",
        }
        return accepted
    except HTTPException:
        raise
    except Exception as start_error:
        logger.error(f"[StoryWriter] Failed to start complete video generation: {start_error}")
        raise HTTPException(status_code=500, detail=str(start_error))
def execute_complete_video_generation(
    task_id: str,
    request_data: Dict[str, Any],
    user_id: str,
):
    """
    Execute complete video generation workflow synchronously.
    Runs in a background task and performs blocking operations.

    Steps, with the task progress reported at each one: premise (10%),
    structured outline (20%), scene images (30-50%), scene audio (50-70%),
    asset matching (70%), video composition (75-95%), done (100%).

    Args:
        task_id: Task manager id that receives progress, result and errors.
        request_data: Story generation request as a dict. The story parameter
            keys are required; image/audio/video settings are optional and
            fall back to the defaults used below.
        user_id: Authenticated user the generation is performed for.

    This function never raises: any failure is logged with its traceback and
    recorded on the task as status "failed".
    """
    try:
        task_manager.update_task_status(task_id, "processing", progress=5.0, message="Starting complete video generation...")

        # Story parameters shared by the premise and outline calls.
        story_params = {
            "persona": request_data["persona"],
            "story_setting": request_data["story_setting"],
            "character_input": request_data["character_input"],
            "plot_elements": request_data["plot_elements"],
            "writing_style": request_data["writing_style"],
            "story_tone": request_data["story_tone"],
            "narrative_pov": request_data["narrative_pov"],
            "audience_age_group": request_data["audience_age_group"],
            "content_rating": request_data["content_rating"],
            "ending_preference": request_data["ending_preference"],
        }

        task_manager.update_task_status(task_id, "processing", progress=10.0, message="Generating story premise...")
        premise = story_service.generate_premise(user_id=user_id, **story_params)

        task_manager.update_task_status(task_id, "processing", progress=20.0, message="Generating structured outline with scenes...")
        outline_scenes = story_service.generate_outline(
            premise=premise,
            user_id=user_id,
            use_structured_output=True,
            **story_params,
        )
        # The rest of the workflow needs per-scene data, so a non-list outline is fatal.
        if not isinstance(outline_scenes, list):
            raise RuntimeError("Failed to generate structured outline")

        task_manager.update_task_status(task_id, "processing", progress=30.0, message="Generating images for scenes...")

        def image_progress_callback(sub_progress: float, message: str):
            # Image generation occupies the 30-50% band of the overall task.
            overall_progress = 30.0 + (sub_progress * 0.2)
            task_manager.update_task_status(task_id, "processing", progress=overall_progress, message=message)

        image_results = image_service.generate_scene_images(
            scenes=outline_scenes,
            user_id=user_id,
            provider=request_data.get("image_provider"),
            width=request_data.get("image_width", 1024),
            height=request_data.get("image_height", 1024),
            model=request_data.get("image_model"),
            progress_callback=image_progress_callback,
        )

        task_manager.update_task_status(task_id, "processing", progress=50.0, message="Generating audio narration for scenes...")

        def audio_progress_callback(sub_progress: float, message: str):
            # Audio generation occupies the 50-70% band of the overall task.
            overall_progress = 50.0 + (sub_progress * 0.2)
            task_manager.update_task_status(task_id, "processing", progress=overall_progress, message=message)

        audio_results = audio_service.generate_scene_audio_list(
            scenes=outline_scenes,
            user_id=user_id,
            provider=request_data.get("audio_provider", "gtts"),
            lang=request_data.get("audio_lang", "en"),
            slow=request_data.get("audio_slow", False),
            rate=request_data.get("audio_rate", 150),
            progress_callback=audio_progress_callback,
        )

        task_manager.update_task_status(task_id, "processing", progress=70.0, message="Preparing video assets...")

        # Pair each scene with its image and audio by scene_number; keep only
        # scenes where both were generated without error and have a file path.
        image_paths: List[str] = []
        audio_paths: List[str] = []
        valid_scenes: List[Dict[str, Any]] = []
        for scene in outline_scenes:
            scene_number = scene.get("scene_number", 0)
            image_result = next((img for img in image_results if img.get("scene_number") == scene_number), None)
            audio_result = next((aud for aud in audio_results if aud.get("scene_number") == scene_number), None)
            if image_result and audio_result and not image_result.get("error") and not audio_result.get("error"):
                image_path = image_result.get("image_path")
                audio_path = audio_result.get("audio_path")
                if image_path and audio_path:
                    image_paths.append(image_path)
                    audio_paths.append(audio_path)
                    valid_scenes.append(scene)

        if len(image_paths) == 0 or len(audio_paths) == 0:
            raise RuntimeError(
                f"No valid images or audio files were generated. Images: {len(image_paths)}, Audio: {len(audio_paths)}"
            )
        if len(image_paths) != len(audio_paths):
            raise RuntimeError(
                f"Mismatch between image and audio counts. Images: {len(image_paths)}, Audio: {len(audio_paths)}"
            )

        task_manager.update_task_status(task_id, "processing", progress=75.0, message="Composing video from scenes...")

        def video_progress_callback(sub_progress: float, message: str):
            # Video composition occupies the 75-95% band of the overall task.
            overall_progress = 75.0 + (sub_progress * 0.2)
            task_manager.update_task_status(task_id, "processing", progress=overall_progress, message=message)

        # The story setting doubles as the title (first 50 characters). "or"
        # also covers a key that is present but None/empty, which .get()'s
        # default alone would not and which would fail on the slice.
        story_title = (request_data.get("story_setting") or "Story")[:50]
        video_result = video_service.generate_story_video(
            scenes=valid_scenes,
            image_paths=image_paths,
            audio_paths=audio_paths,
            user_id=user_id,
            story_title=story_title,
            fps=request_data.get("video_fps", 24),
            transition_duration=request_data.get("video_transition_duration", 0.5),
            progress_callback=video_progress_callback,
        )

        result = {
            "premise": premise,
            "outline_scenes": outline_scenes,
            "images": image_results,
            "audio_files": audio_results,
            "video": video_result,
            "success": True,
        }
        task_manager.update_task_status(
            task_id,
            "completed",
            progress=100.0,
            message="Complete video generation finished!",
            result=result,
        )
        logger.info(f"[StoryWriter] Complete video generation task {task_id} completed successfully")
    except Exception as exc:
        error_msg = str(exc)
        # loguru has no exc_info flag: extra kwargs make it run
        # message.format(**kwargs), which can itself raise when the error text
        # contains braces, so the task would never be marked failed.
        # logger.exception() records the traceback without formatting the message.
        logger.exception(f"[StoryWriter] Complete video generation task {task_id} failed: {error_msg}")
        task_manager.update_task_status(
            task_id,
            "failed",
            error=error_msg,
            message=f"Complete video generation failed: {error_msg}",
        )
@router.get("/videos/{video_filename}")
async def serve_story_video(
    video_filename: str,
    current_user: Dict[str, Any] = Depends(get_current_user),
):
    """
    Stream a previously generated story video back to the caller.

    The caller must be authenticated. The filename is resolved against the
    video service's output directory by ``resolve_media_file``; HTTP errors
    raised along the way are passed through untouched, anything else is
    logged and reported as a 500.
    """
    try:
        require_authenticated_user(current_user)
        resolved_path = resolve_media_file(video_service.output_dir, video_filename)
        response = FileResponse(
            path=str(resolved_path),
            media_type="video/mp4",
            filename=video_filename,
        )
        return response
    except Exception as exc:
        # HTTP errors already carry the right status code; re-raise them as-is.
        if isinstance(exc, HTTPException):
            raise
        logger.error(f"[StoryWriter] Failed to serve video: {exc}")
        raise HTTPException(status_code=500, detail=str(exc))

View File

@@ -1,13 +1,11 @@
from __future__ import annotations
from typing import Any, Dict, Optional
from typing import Any, Dict
from fastapi import HTTPException
from loguru import logger
from uuid import uuid4
from .media_utils import load_story_image_bytes
def generate_hd_video_payload(request: Any, user_id: str) -> Dict[str, Any]:
"""Handles synchronous HD video generation."""
@@ -57,8 +55,8 @@ def generate_hd_video_payload(request: Any, user_id: str) -> Dict[str, Any]:
def generate_hd_video_scene_payload(request: Any, user_id: str) -> Dict[str, Any]:
"""
Handles per-scene HD video generation including prompt enhancement,
subscription validation, and optional image conditioning.
Handles per-scene HD video generation including prompt enhancement
and subscription validation.
"""
from services.database import get_db as get_db_validation
from services.onboarding.api_key_manager import APIKeyManager
@@ -71,7 +69,6 @@ def generate_hd_video_scene_payload(request: Any, user_id: str) -> Dict[str, Any
scene_number = request.scene_number
logger.info(f"[StoryWriter] Generating HD video for scene {scene_number} for user {user_id}")
# Step 1: Validate API key
hf_token = APIKeyManager().get_api_key("hf_token")
if not hf_token:
logger.error("[StoryWriter] Pre-flight: HF token not configured - blocking video generation")
@@ -83,7 +80,6 @@ def generate_hd_video_scene_payload(request: Any, user_id: str) -> Dict[str, Any
},
)
# Step 2: Subscription limits
db_validation = next(get_db_validation())
try:
pricing_service = PricingService(db_validation)
@@ -93,7 +89,6 @@ def generate_hd_video_scene_payload(request: Any, user_id: str) -> Dict[str, Any
finally:
db_validation.close()
# Stage 1: Prompt enhancement
enhanced_prompt = enhance_scene_prompt_for_video(
current_scene=request.scene_data,
story_context=request.story_context,
@@ -102,15 +97,6 @@ def generate_hd_video_scene_payload(request: Any, user_id: str) -> Dict[str, Any
)
logger.info(f"[StoryWriter] Generated enhanced prompt ({len(enhanced_prompt)} chars) for scene {scene_number}")
# Stage 2: Optional image reference
scene_image_bytes: Optional[bytes] = None
if getattr(request, "scene_image_url", None):
scene_image_bytes = load_story_image_bytes(request.scene_image_url)
if scene_image_bytes:
logger.info(f"[StoryWriter] Using scene image reference for scene {scene_number}")
else:
logger.warning(f"[StoryWriter] Scene image could not be loaded for scene {scene_number}, falling back to text-only video")
kwargs: Dict[str, Any] = {}
if getattr(request, "model", None):
kwargs["model"] = request.model
@@ -129,7 +115,6 @@ def generate_hd_video_scene_payload(request: Any, user_id: str) -> Dict[str, Any
prompt=enhanced_prompt,
provider=getattr(request, "provider", None) or "huggingface",
user_id=user_id,
input_image_bytes=scene_image_bytes,
**kwargs,
)
@@ -151,4 +136,3 @@ def generate_hd_video_scene_payload(request: Any, user_id: str) -> Dict[str, Any
"model": getattr(request, "model", None) or "tencent/HunyuanVideo",
}

View File

@@ -11,6 +11,8 @@ from loguru import logger
# Anchor every story media directory to the backend root, derived from this
# file's own location rather than the process working directory.
BASE_DIR = Path(__file__).resolve().parents[3] # backend/
# Generated scene images. The directory is created at import time so later
# lookups can rely on it existing.
STORY_IMAGES_DIR = (BASE_DIR / "story_images").resolve()
STORY_IMAGES_DIR.mkdir(parents=True, exist_ok=True)
# Generated scene audio, created at import time in the same way.
STORY_AUDIO_DIR = (BASE_DIR / "story_audio").resolve()
STORY_AUDIO_DIR.mkdir(parents=True, exist_ok=True)
def load_story_image_bytes(image_url: str) -> Optional[bytes]:
@@ -48,6 +50,41 @@ def load_story_image_bytes(image_url: str) -> Optional[bytes]:
return None
def load_story_audio_bytes(audio_url: str) -> Optional[bytes]:
    """
    Resolve an authenticated story audio URL (e.g., /api/story/audio/<file>) to raw bytes.

    Args:
        audio_url: Absolute or relative URL containing the ``/api/story/audio/``
            prefix followed by the stored filename.

    Returns:
        The file contents, or None if the URL is empty or unsupported, the
        resolved path escapes ``STORY_AUDIO_DIR``, the file is missing, or any
        error occurs while reading it. This function never raises.
    """
    if not audio_url:
        return None

    try:
        parsed = urlparse(audio_url)
        path = parsed.path if parsed.scheme else audio_url
        prefix = "/api/story/audio/"
        if prefix not in path:
            logger.warning(f"[StoryWriter] Unsupported audio URL for video reference: {audio_url}")
            return None

        filename = path.split(prefix, 1)[1].split("?", 1)[0].strip()
        if not filename:
            return None

        file_path = (STORY_AUDIO_DIR / filename).resolve()
        # Containment must be checked on path components, not on the string
        # form: a plain startswith() would also accept sibling directories
        # such as "<base>/story_audio_evil/...".
        try:
            file_path.relative_to(STORY_AUDIO_DIR)
        except ValueError:
            logger.error(f"[StoryWriter] Attempted path traversal when resolving audio: {audio_url}")
            return None

        if not file_path.exists():
            logger.warning(f"[StoryWriter] Referenced scene audio not found on disk: {file_path}")
            return None

        return file_path.read_bytes()
    except Exception as exc:
        # Best effort: callers fall back gracefully when no audio is available.
        logger.error(f"[StoryWriter] Failed to load reference audio for video gen: {exc}")
        return None
def resolve_media_file(base_dir: Path, filename: str) -> Path:
"""
Returns a safe resolved path for a media file stored under base_dir.
@@ -62,8 +99,50 @@ def resolve_media_file(base_dir: Path, filename: str) -> Path:
raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Access denied")
if not resolved.exists():
alternate = _find_alternate_media_file(base_dir, filename)
if alternate:
logger.warning(
"[StoryWriter] Requested media file '%s' missing; serving closest match '%s'",
filename,
alternate.name,
)
return alternate
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"File not found: {filename}")
return resolved
def _find_alternate_media_file(base_dir: Path, filename: str) -> Optional[Path]:
    """
    Attempt to find the most recent media file that matches the original name prefix.

    This helps when files are regenerated with new UUID/hash suffixes but the frontend still
    references an older filename.

    Args:
        base_dir: Directory to search (non-recursive).
        filename: Requested filename, expected to look like ``<prefix>_<id><ext>``.

    Returns:
        The most recently modified file named ``<prefix>_*<ext>`` in ``base_dir``,
        or None when the name has no extension or no underscore, nothing
        matches, or the search fails.
    """
    from glob import escape as glob_escape

    try:
        base_dir = base_dir.resolve()
    except Exception:
        return None

    requested = Path(filename)
    stem, suffix = requested.stem, requested.suffix
    if not suffix or "_" not in stem:
        return None

    prefix = stem.rsplit("_", 1)[0]
    # The filename comes from the request. Escape glob metacharacters so a name
    # like "*_x.mp4" cannot widen the search to every file in the directory.
    pattern = f"{glob_escape(prefix)}_*{glob_escape(suffix)}"

    try:
        return max(
            (p for p in base_dir.glob(pattern) if p.is_file()),
            key=lambda p: p.stat().st_mtime,
            default=None,
        )
    except Exception as exc:
        logger.debug(f"[StoryWriter] Failed to search alternate media files for {filename}: {exc}")
        return None

View File

@@ -4,6 +4,7 @@ Provides endpoints for subscription management and usage monitoring.
"""
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy.orm import Session
from sqlalchemy import desc, func
from typing import Dict, Any, Optional, List
@@ -116,6 +117,7 @@ async def get_subscription_plans(
"stability_calls": plan.stability_calls_limit,
"video_calls": getattr(plan, 'video_calls_limit', 0),
"image_edit_calls": getattr(plan, 'image_edit_calls_limit', 0),
"audio_calls": getattr(plan, 'audio_calls_limit', 0),
"gemini_tokens": plan.gemini_tokens_limit,
"openai_tokens": plan.openai_tokens_limit,
"anthropic_tokens": plan.anthropic_tokens_limit,
@@ -134,7 +136,7 @@ async def get_subscription_plans(
except (sqlite3.OperationalError, Exception) as e:
error_str = str(e).lower()
if 'no such column' in error_str and ('exa_calls_limit' in error_str or 'video_calls_limit' in error_str or 'image_edit_calls_limit' in error_str):
if 'no such column' in error_str and ('exa_calls_limit' in error_str or 'video_calls_limit' in error_str or 'image_edit_calls_limit' in error_str or 'audio_calls_limit' in error_str):
logger.warning("Missing column detected in subscription plans query, attempting schema fix...")
try:
import services.subscription.schema_utils as schema_utils
@@ -241,6 +243,7 @@ async def get_user_subscription(
"stability_calls": free_plan.stability_calls_limit,
"video_calls": getattr(free_plan, 'video_calls_limit', 0),
"image_edit_calls": getattr(free_plan, 'image_edit_calls_limit', 0),
"audio_calls": getattr(free_plan, 'audio_calls_limit', 0),
"monthly_cost": free_plan.monthly_cost_limit
}
}
@@ -340,6 +343,7 @@ async def get_subscription_status(
"stability_calls": free_plan.stability_calls_limit,
"video_calls": getattr(free_plan, 'video_calls_limit', 0),
"image_edit_calls": getattr(free_plan, 'image_edit_calls_limit', 0),
"audio_calls": getattr(free_plan, 'audio_calls_limit', 0),
"monthly_cost": free_plan.monthly_cost_limit
}
}
@@ -405,7 +409,7 @@ async def get_subscription_status(
except (sqlite3.OperationalError, Exception) as e:
error_str = str(e).lower()
if 'no such column' in error_str and ('exa_calls_limit' in error_str or 'video_calls_limit' in error_str or 'image_edit_calls_limit' in error_str):
if 'no such column' in error_str and ('exa_calls_limit' in error_str or 'video_calls_limit' in error_str or 'image_edit_calls_limit' in error_str or 'audio_calls_limit' in error_str):
# Try to fix schema and retry once
logger.warning("Missing column detected in subscription status query, attempting schema fix...")
try:
@@ -499,6 +503,7 @@ async def get_subscription_status(
"stability_calls": plan.stability_calls_limit,
"video_calls": getattr(plan, 'video_calls_limit', 0),
"image_edit_calls": getattr(plan, 'image_edit_calls_limit', 0),
"audio_calls": getattr(plan, 'audio_calls_limit', 0),
"monthly_cost": plan.monthly_cost_limit
}
}
@@ -988,7 +993,7 @@ async def get_dashboard_data(
except (sqlite3.OperationalError, Exception) as e:
error_str = str(e).lower()
if 'no such column' in error_str and ('exa_calls' in error_str or 'exa_cost' in error_str or 'video_calls' in error_str or 'video_cost' in error_str or 'image_edit_calls' in error_str or 'image_edit_cost' in error_str):
if 'no such column' in error_str and ('exa_calls' in error_str or 'exa_cost' in error_str or 'video_calls' in error_str or 'video_cost' in error_str or 'image_edit_calls' in error_str or 'image_edit_cost' in error_str or 'audio_calls' in error_str or 'audio_cost' in error_str):
logger.warning("Missing column detected in dashboard query, attempting schema fix...")
try:
import services.subscription.schema_utils as schema_utils
@@ -1271,4 +1276,235 @@ async def get_usage_logs(
raise
except Exception as e:
logger.error(f"Error getting usage logs: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Failed to get usage logs: {str(e)}")
raise HTTPException(status_code=500, detail=f"Failed to get usage logs: {str(e)}")
class PreflightOperationRequest(BaseModel):
    """Request model for pre-flight check operation.

    Describes one planned API operation whose limits and estimated cost
    should be evaluated before it is actually executed.
    """
    # Provider identifier string (e.g. "video", "audio", "huggingface").
    provider: str
    # Model name used for the pricing lookup; optional.
    model: Optional[str] = None
    # Requested token count; defaults to 0 when omitted.
    tokens_requested: Optional[int] = 0
    # Free-form label for the operation, echoed back in the result.
    operation_type: str
    # Display name of the real provider behind `provider`, if different.
    actual_provider_name: Optional[str] = None
class PreflightCheckRequest(BaseModel):
    """Request model for pre-flight check.

    Wraps the batch of operations to be checked in a single request.
    """
    # Operations to check; the field is required (no default is declared).
    operations: List[PreflightOperationRequest]
@router.post("/preflight-check")
async def preflight_check(
    request: PreflightCheckRequest,
    db: Session = Depends(get_db),
    current_user: Dict[str, Any] = Depends(get_current_user)
) -> Dict[str, Any]:
    """
    Pre-flight check for operations with cost estimation.

    Lightweight endpoint that:
    - Validates if operations are allowed based on subscription limits
    - Estimates cost for operations
    - Returns usage information and remaining quota

    Operations whose provider cannot be mapped to an ``APIProvider`` are
    skipped with a warning; the skipped entries do not appear in the result.

    Returns:
        ``{"success": True, "data": {...}}`` where ``data`` holds
        ``can_proceed``, ``estimated_cost`` / ``total_cost``, the per-operation
        results and a video-centric ``usage_summary``. A ``remaining`` value of
        ``None`` means the corresponding limit is 0 (treated as unlimited).
        ``cached`` is currently always False.

    Raises:
        HTTPException: 401 when the token carries no user id, 400 when no
            operation could be mapped to a known provider, 500 on any other
            failure.
    """
    try:
        user_id = str(current_user.get('id', ''))
        if not user_id:
            raise HTTPException(status_code=401, detail="Invalid user ID in authentication token")

        # Ensure schema columns exist (best effort; a failure here is not fatal)
        try:
            ensure_subscription_plan_columns(db)
            ensure_usage_summaries_columns(db)
        except Exception as schema_err:
            logger.warning(f"Schema check failed: {schema_err}")

        pricing_service = PricingService(db)

        # Convert request operations to internal format.
        # `operation_models` is kept in lockstep with `operations_to_validate`
        # so that the model of each *accepted* operation stays aligned even
        # when earlier operations were skipped.
        operations_to_validate = []
        operation_models = []
        for op in request.operations:
            try:
                # Map provider string to APIProvider enum
                provider_str = op.provider.lower()
                if provider_str == "huggingface":
                    provider_enum = APIProvider.MISTRAL  # Maps to HuggingFace
                elif provider_str == "video":
                    provider_enum = APIProvider.VIDEO
                elif provider_str == "image_edit":
                    provider_enum = APIProvider.IMAGE_EDIT
                elif provider_str == "stability":
                    provider_enum = APIProvider.STABILITY
                elif provider_str == "audio":
                    provider_enum = APIProvider.AUDIO
                else:
                    try:
                        provider_enum = APIProvider(provider_str)
                    except ValueError:
                        logger.warning(f"Unknown provider: {provider_str}, skipping")
                        continue

                operations_to_validate.append({
                    'provider': provider_enum,
                    'tokens_requested': op.tokens_requested or 0,
                    'actual_provider_name': op.actual_provider_name or op.provider,
                    'operation_type': op.operation_type
                })
                operation_models.append(op.model)
            except Exception as e:
                logger.warning(f"Error processing operation {op.operation_type}: {e}")
                continue

        if not operations_to_validate:
            raise HTTPException(status_code=400, detail="No valid operations provided")

        # Perform pre-flight validation
        can_proceed, message, error_details = pricing_service.check_comprehensive_limits(
            user_id=user_id,
            operations=operations_to_validate
        )

        # Limits and the current usage row are the same for every operation,
        # so fetch them once instead of once per loop iteration.
        limits = pricing_service.get_user_limits(user_id)
        usage_summary = None
        if limits:
            usage_summary = db.query(UsageSummary).filter(
                UsageSummary.user_id == user_id,
                UsageSummary.billing_period == pricing_service.get_current_billing_period(user_id)
            ).first()

        # Get pricing and cost estimation for each operation
        operation_results = []
        total_cost = 0.0

        for op, model_name in zip(operations_to_validate, operation_models):
            op_result = {
                'provider': op['actual_provider_name'],
                'operation_type': op['operation_type'],
                'cost': 0.0,
                'allowed': can_proceed,
                'limit_info': None,
                'message': None
            }

            # Get pricing for this operation
            if model_name:
                pricing_info = pricing_service.get_pricing_for_provider_model(
                    op['provider'],
                    model_name
                )

                if pricing_info:
                    # Determine cost based on operation type
                    if op['provider'] in [APIProvider.VIDEO, APIProvider.IMAGE_EDIT, APIProvider.STABILITY]:
                        cost = pricing_info.get('cost_per_request', 0.0) or pricing_info.get('cost_per_image', 0.0) or 0.0
                    elif op['provider'] == APIProvider.AUDIO:
                        # Audio pricing is per character (every character is 1 token)
                        cost = (pricing_info.get('cost_per_input_token', 0.0) or 0.0) * (op['tokens_requested'] / 1000.0)
                    elif op['tokens_requested'] > 0:
                        # Token-based cost estimation (rough estimate)
                        cost = (pricing_info.get('cost_per_input_token', 0.0) or 0.0) * (op['tokens_requested'] / 1000)
                    else:
                        cost = pricing_info.get('cost_per_request', 0.0) or 0.0

                    op_result['cost'] = round(cost, 4)
                    total_cost += cost
                else:
                    # Use default cost if pricing not found
                    if op['provider'] == APIProvider.VIDEO:
                        op_result['cost'] = 0.10  # Default video cost
                        total_cost += 0.10
                    elif op['provider'] == APIProvider.IMAGE_EDIT:
                        op_result['cost'] = 0.05  # Default image edit cost
                        total_cost += 0.05
                    elif op['provider'] == APIProvider.STABILITY:
                        op_result['cost'] = 0.04  # Default image generation cost
                        total_cost += 0.04
                    elif op['provider'] == APIProvider.AUDIO:
                        # Default audio cost: $0.05 per 1,000 characters
                        cost = (op['tokens_requested'] / 1000.0) * 0.05
                        op_result['cost'] = round(cost, 4)
                        total_cost += cost

            # Get limit information
            if error_details and not can_proceed:
                # Validation failed: report the usage figures the validator returned.
                usage_info = error_details.get('usage_info', {})
                if usage_info:
                    op_result['message'] = message
                    op_result['limit_info'] = {
                        'current_usage': usage_info.get('current_usage', 0),
                        'limit': usage_info.get('limit', 0),
                        'remaining': max(0, usage_info.get('limit', 0) - usage_info.get('current_usage', 0))
                    }
            elif usage_summary:
                # Get current usage for this provider
                if op['provider'] == APIProvider.VIDEO:
                    current = getattr(usage_summary, 'video_calls', 0) or 0
                    limit = limits['limits'].get('video_calls', 0)
                elif op['provider'] == APIProvider.IMAGE_EDIT:
                    current = getattr(usage_summary, 'image_edit_calls', 0) or 0
                    limit = limits['limits'].get('image_edit_calls', 0)
                elif op['provider'] == APIProvider.STABILITY:
                    current = getattr(usage_summary, 'stability_calls', 0) or 0
                    limit = limits['limits'].get('stability_calls', 0)
                elif op['provider'] == APIProvider.AUDIO:
                    current = getattr(usage_summary, 'audio_calls', 0) or 0
                    limit = limits['limits'].get('audio_calls', 0)
                else:
                    # For LLM providers, use token limits
                    provider_key = op['provider'].value
                    current = getattr(usage_summary, f"{provider_key}_tokens", 0) or 0
                    limit = limits['limits'].get(f"{provider_key}_tokens", 0)

                op_result['limit_info'] = {
                    'current_usage': current,
                    'limit': limit,
                    # None (JSON null) marks "unlimited"; float('inf') cannot be
                    # serialized by the default JSON response.
                    'remaining': max(0, limit - current) if limit > 0 else None
                }

            operation_results.append(op_result)

        response_data = {
            'can_proceed': can_proceed,
            'estimated_cost': round(total_cost, 4),
            'operations': operation_results,
            'total_cost': round(total_cost, 4),
            'usage_summary': None,
            'cached': False  # TODO: Track if result was cached
        }

        if usage_summary and limits:
            # For video generation, show video limits
            video_current = getattr(usage_summary, 'video_calls', 0) or 0
            video_limit = limits['limits'].get('video_calls', 0)
            response_data['usage_summary'] = {
                'current_calls': video_current,
                'limit': video_limit,
                'remaining': max(0, video_limit - video_current) if video_limit > 0 else None
            }

        return {
            "success": True,
            "data": response_data
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error in pre-flight check: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Pre-flight check failed: {str(e)}")

View File

@@ -97,7 +97,14 @@ def setup_clean_logging():
def video_generation_filter(record):
    """
    Return True when a log record belongs to the video generation pipeline.

    A record matches when its message carries the ``[StoryVideoGeneration]``
    or ``[video_gen]`` tag, when it was emitted from one of the video
    generation modules, or when it is bound with
    ``service="video_generation_service"`` in its ``extra`` data.
    """
    # Message or logger name may be missing or None; normalise to "" so the
    # substring checks below cannot raise.
    msg = record.get("message") or ""
    name = record.get("name") or ""
    service = (record.get("extra") or {}).get("service")
    return (
        "[StoryVideoGeneration]" in msg
        or "services.story_writer.video_generation_service" in name
        or "[video_gen]" in msg
        or service == "video_generation_service"
        or "services.llm_providers.main_video_generation" in name
    )
logger.add(
sys.stdout.write,
level="INFO",

View File

@@ -2,7 +2,7 @@
import os
from typing import Optional, Dict, Any
from fastapi import HTTPException, Depends, status
from fastapi import HTTPException, Depends, status, Request, Query
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from loguru import logger
from dotenv import load_dotenv
@@ -259,3 +259,63 @@ async def get_optional_user(
except Exception as e:
logger.warning(f"Optional authentication failed: {e}")
return None
async def get_current_user_with_query_token(
    request: Request,
    credentials: Optional[HTTPAuthorizationCredentials] = Depends(security)
) -> Dict[str, Any]:
    """Authenticate a request using the Authorization header or a ``token`` query parameter.

    Media endpoints (audio, video, images) are often loaded by HTML elements
    such as <audio> or <img>, which cannot attach custom headers; those
    callers may pass the token in the query string instead. The header wins
    whenever it is present.

    Args:
        request: FastAPI request object, used to read the query string
        credentials: HTTP authorization credentials from header, if any

    Returns:
        User dictionary with authentication info

    Raises:
        HTTPException: 401 when no token is supplied or verification fails
    """

    def _unauthorized(detail: str) -> HTTPException:
        # All failure paths share the same status code and challenge header.
        return HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=detail,
            headers={"WWW-Authenticate": "Bearer"},
        )

    try:
        # Header credentials take priority; the query parameter is the fallback.
        if credentials:
            bearer = credentials.credentials
        else:
            bearer = request.query_params.get("token")

        if not bearer:
            logger.warning("No credentials provided (neither header nor query parameter)")
            raise _unauthorized("Not authenticated")

        verified_user = await clerk_auth.verify_token(bearer)
        if verified_user:
            return verified_user

        # Token verification failed (likely expired) - log at debug level to reduce noise
        logger.debug("Token verification failed (likely expired token)")
        raise _unauthorized("Authentication failed")
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Authentication error: {e}")
        raise _unauthorized("Authentication failed")

View File

@@ -207,6 +207,32 @@ class StoryImageGenerationResponse(BaseModel):
task_id: Optional[str] = Field(None, description="Task ID for async operations")
class RegenerateImageRequest(BaseModel):
    """Request model for regenerating a single scene image with a direct prompt.

    The prompt is passed through as given. Only scene_number, scene_title and
    prompt are required; every other field has a default.
    """
    scene_number: int = Field(..., description="Scene number to regenerate image for")
    scene_title: str = Field(..., description="Scene title")
    prompt: str = Field(..., description="Direct prompt to use for image generation (no AI prompt generation)")
    # Provider and model default to None when the client omits them.
    provider: Optional[str] = Field(None, description="Image generation provider (gemini, huggingface, stability)")
    # Output size defaults to a 1024x1024 square.
    width: Optional[int] = Field(1024, description="Image width")
    height: Optional[int] = Field(1024, description="Image height")
    model: Optional[str] = Field(None, description="Model to use for image generation")
class RegenerateImageResponse(BaseModel):
    """Response model for regenerated image.

    Echoes the scene identity and reports where the new image is stored and
    which provider, model and seed produced it.
    """
    scene_number: int = Field(..., description="Scene number")
    scene_title: str = Field(..., description="Scene title")
    image_filename: str = Field(..., description="Generated image filename")
    image_url: str = Field(..., description="Image URL")
    width: int = Field(..., description="Image width")
    height: int = Field(..., description="Image height")
    provider: str = Field(..., description="Provider used")
    # model and seed are optional and default to None.
    model: Optional[str] = Field(None, description="Model used")
    seed: Optional[int] = Field(None, description="Seed used")
    # success defaults to True; error is the optional failure message.
    success: bool = Field(default=True, description="Whether the generation was successful")
    error: Optional[str] = Field(None, description="Error message if generation failed")
class StoryAudioGenerationRequest(BaseModel):
"""Request model for audio generation."""
scenes: List[StoryScene] = Field(..., description="List of scenes to generate audio for")
@@ -234,11 +260,41 @@ class StoryAudioGenerationResponse(BaseModel):
task_id: Optional[str] = Field(None, description="Task ID for async operations")
class GenerateAIAudioRequest(BaseModel):
    """Request model for generating AI audio for a single scene.

    NOTE: the numeric ranges quoted in the field descriptions are
    documentation only; no ge/le constraints are declared on these fields.
    """
    scene_number: int = Field(..., description="Scene number to generate audio for")
    scene_title: str = Field(..., description="Scene title")
    text: str = Field(..., description="Text to convert to speech")
    # Voice and delivery settings; each has a default and may be omitted.
    voice_id: Optional[str] = Field("Wise_Woman", description="Voice ID for AI audio generation")
    speed: Optional[float] = Field(1.0, description="Speech speed (0.5-2.0)")
    volume: Optional[float] = Field(1.0, description="Speech volume (0.1-10.0)")
    pitch: Optional[float] = Field(0.0, description="Speech pitch (-12 to 12)")
    emotion: Optional[str] = Field("happy", description="Emotion for speech")
class GenerateAIAudioResponse(BaseModel):
    """Response model for AI audio generation.

    Reports where the generated audio is stored, which provider, model and
    voice were used, and the size and cost of the generation.
    """
    scene_number: int = Field(..., description="Scene number")
    scene_title: str = Field(..., description="Scene title")
    audio_filename: str = Field(..., description="Generated audio filename")
    audio_url: str = Field(..., description="Audio URL")
    provider: str = Field(..., description="Provider used (wavespeed)")
    model: str = Field(..., description="Model used (minimax/speech-02-hd)")
    voice_id: str = Field(..., description="Voice ID used")
    # Length of the input text in characters, and size of the audio in bytes.
    text_length: int = Field(..., description="Number of characters in text")
    file_size: int = Field(..., description="Audio file size in bytes")
    cost: float = Field(..., description="Cost of generation")
    # success defaults to True; error is the optional failure message.
    success: bool = Field(default=True, description="Whether the generation was successful")
    error: Optional[str] = Field(None, description="Error message if generation failed")
class StoryVideoGenerationRequest(BaseModel):
"""Request model for video generation."""
scenes: List[StoryScene] = Field(..., description="List of scenes to generate video for")
image_urls: List[str] = Field(..., description="List of image URLs for each scene")
audio_urls: List[str] = Field(..., description="List of audio URLs for each scene")
video_urls: Optional[List[Optional[str]]] = Field(None, description="Optional list of animated video URLs (preferred over images)")
ai_audio_urls: Optional[List[Optional[str]]] = Field(None, description="Optional list of AI audio URLs (preferred over free audio)")
story_title: Optional[str] = Field(default="Story", description="Title of the story")
fps: Optional[int] = Field(default=24, description="Frames per second for video")
transition_duration: Optional[float] = Field(default=0.5, description="Duration of transitions between scenes")
@@ -260,3 +316,39 @@ class StoryVideoGenerationResponse(BaseModel):
video: StoryVideoResult = Field(..., description="Generated video")
success: bool = Field(default=True, description="Whether the generation was successful")
task_id: Optional[str] = Field(None, description="Task ID for async operations")
class AnimateSceneRequest(BaseModel):
    """Request model for per-scene animation preview.

    Carries the scene payload, the story-wide context and the URL of the
    already-generated scene image.
    """
    scene_number: int = Field(..., description="Scene number to animate")
    # Free-form dictionaries; no schema is declared for them on this model.
    scene_data: Dict[str, Any] = Field(..., description="Scene data payload")
    story_context: Dict[str, Any] = Field(..., description="Story-wide context used for prompts")
    image_url: str = Field(..., description="Relative URL to the generated scene image")
    # Defaults to 5; the "5 or 10" in the description is not enforced here.
    duration: int = Field(default=5, description="Animation duration (5 or 10 seconds)")
class AnimateSceneVoiceoverRequest(AnimateSceneRequest):
    """Request model for WaveSpeed InfiniteTalk animation.

    Inherits every AnimateSceneRequest field and adds the scene audio URL plus
    optional resolution and prompt settings.
    """
    audio_url: str = Field(..., description="Relative URL to the generated scene audio")
    # Defaults to "720p"; the allowed values in the description are not enforced here.
    resolution: Optional[str] = Field("720p", description="Output resolution ('480p' or '720p')")
    prompt: Optional[str] = Field(None, description="Optional positive prompt override")
class AnimateSceneResponse(BaseModel):
    """Response model for scene animation preview.

    Reports where the animated clip is stored, what it cost, and the prompt
    and provider that produced it.
    """
    success: bool = Field(default=True, description="Whether the animation succeeded")
    scene_number: int = Field(..., description="Scene number animated")
    video_filename: str = Field(..., description="Stored video filename")
    video_url: str = Field(..., description="API URL to access the animated video")
    duration: int = Field(..., description="Duration of the animation")
    cost: float = Field(..., description="Cost billed for the animation")
    prompt_used: str = Field(..., description="Animation prompt passed to the model")
    # provider defaults to "wavespeed" when not set explicitly.
    provider: str = Field(default="wavespeed", description="Underlying provider used")
    # Optional; per its description, meant for resuming an animation later.
    prediction_id: Optional[str] = Field(None, description="WaveSpeed prediction ID for resume operations")
class ResumeSceneAnimationRequest(BaseModel):
    """Request model to resume scene animation download.

    Identifies an existing WaveSpeed prediction by id together with the scene
    it belongs to.
    """
    prediction_id: str = Field(..., description="WaveSpeed prediction ID to resume from")
    scene_number: int = Field(..., description="Scene number being resumed")
    # Defaults to 5; the "5 or 10" in the description is not enforced here.
    duration: int = Field(default=5, description="Animation duration (5 or 10 seconds)")

View File

@@ -37,6 +37,7 @@ class APIProvider(enum.Enum):
EXA = "exa"
VIDEO = "video"
IMAGE_EDIT = "image_edit"
AUDIO = "audio"
class BillingCycle(enum.Enum):
MONTHLY = "monthly"
@@ -72,6 +73,7 @@ class SubscriptionPlan(Base):
exa_calls_limit = Column(Integer, default=0) # Exa neural search
video_calls_limit = Column(Integer, default=0) # AI video generation
image_edit_calls_limit = Column(Integer, default=0) # AI image editing
audio_calls_limit = Column(Integer, default=0) # AI audio generation (text-to-speech)
# Token Limits (for LLM providers)
gemini_tokens_limit = Column(Integer, default=0)
@@ -191,6 +193,7 @@ class UsageSummary(Base):
exa_calls = Column(Integer, default=0)
video_calls = Column(Integer, default=0) # AI video generation
image_edit_calls = Column(Integer, default=0) # AI image editing
audio_calls = Column(Integer, default=0) # AI audio generation (text-to-speech)
# Token Usage
gemini_tokens = Column(Integer, default=0)
@@ -211,6 +214,7 @@ class UsageSummary(Base):
exa_cost = Column(Float, default=0.0)
video_cost = Column(Float, default=0.0) # AI video generation
image_edit_cost = Column(Float, default=0.0) # AI image editing
audio_cost = Column(Float, default=0.0) # AI audio generation (text-to-speech)
# Totals
total_calls = Column(Integer, default=0)

View File

@@ -0,0 +1,301 @@
"""
Main Audio Generation Service for ALwrity Backend.
This service provides AI-powered text-to-speech functionality using WaveSpeed Minimax Speech 02 HD.
"""
from __future__ import annotations
import sys
from typing import Optional, Dict, Any
from datetime import datetime
from loguru import logger
from fastapi import HTTPException
from services.wavespeed.client import WaveSpeedClient
from services.onboarding.api_key_manager import APIKeyManager
from utils.logger_utils import get_service_logger
logger = get_service_logger("audio_generation")
class AudioGenerationResult:
    """Plain value holder describing one finished text-to-speech generation.

    Attributes mirror the constructor arguments one-to-one: the raw audio
    payload plus the provider, model and voice that produced it, the length of
    the source text and the size of the resulting file.
    """

    def __init__(
        self,
        audio_bytes: bytes,
        provider: str,
        model: str,
        voice_id: str,
        text_length: int,
        file_size: int,
    ):
        # Assigned in declaration order so the instance dict keeps the same layout.
        self.audio_bytes, self.provider, self.model = audio_bytes, provider, model
        self.voice_id, self.text_length, self.file_size = voice_id, text_length, file_size
# Provider/model identifiers and pricing shared by the limit check, the usage
# log and the returned result.
_AUDIO_PROVIDER = "wavespeed"
_AUDIO_MODEL = "minimax/speech-02-hd"
_AUDIO_COST_PER_1000_CHARS = 0.05  # USD per 1,000 input characters


def _enforce_audio_limits(user_id: str, character_count: int) -> None:
    """Raise HTTPException(429) if the pricing service rejects this audio request."""
    # Project imports stay function-local, as in the original implementation.
    from services.database import get_db
    from services.subscription import PricingService
    from models.subscription_models import APIProvider

    db = next(get_db())
    try:
        # Character count is passed as the "token" quantity for audio.
        can_proceed, message, usage_info = PricingService(db).check_usage_limits(
            user_id=user_id,
            provider=APIProvider.AUDIO,
            tokens_requested=character_count,
            actual_provider_name=_AUDIO_PROVIDER,
        )
        if not can_proceed:
            logger.warning(f"[audio_gen] Subscription limit exceeded for user {user_id}: {message}")
            raise HTTPException(
                status_code=429,
                detail={
                    'error': message,
                    'message': message,
                    'provider': _AUDIO_PROVIDER,
                    'usage_info': usage_info if usage_info else {},
                },
            )
    finally:
        db.close()


def _track_audio_usage(
    *,
    user_id: str,
    voice_id: str,
    text: str,
    audio_bytes: bytes,
    character_count: int,
    estimated_cost: float,
) -> None:
    """Record one successful audio call in the usage summary and usage log.

    Database errors are logged and rolled back here; they are not re-raised,
    because the audio has already been generated.
    """
    # Aliased on purpose: importing sqlalchemy's ``text`` under its own name
    # would rebind the ``text`` parameter and break ``text.encode`` below.
    from sqlalchemy import text as sql_text
    from services.database import get_db
    from services.subscription import PricingService
    from models.subscription_models import UsageSummary, APIUsageLog, APIProvider

    db_track = next(get_db())
    try:
        pricing = PricingService(db_track)
        current_period = pricing.get_current_billing_period(user_id) or datetime.now().strftime("%Y-%m")

        # Get or create the usage summary for this billing period.
        summary = db_track.query(UsageSummary).filter(
            UsageSummary.user_id == user_id,
            UsageSummary.billing_period == current_period,
        ).first()
        if not summary:
            summary = UsageSummary(user_id=user_id, billing_period=current_period)
            db_track.add(summary)
            db_track.flush()

        current_calls_before = getattr(summary, "audio_calls", 0) or 0
        current_cost_before = getattr(summary, "audio_cost", 0.0) or 0.0
        new_calls = current_calls_before + 1
        new_cost = current_cost_before + estimated_cost

        # Use direct SQL UPDATE for dynamic attributes.
        db_track.execute(
            sql_text("""
                UPDATE usage_summaries
                SET audio_calls = :new_calls,
                    audio_cost = :new_cost
                WHERE user_id = :user_id AND billing_period = :period
            """),
            {
                'new_calls': new_calls,
                'new_cost': new_cost,
                'user_id': user_id,
                'period': current_period,
            },
        )

        # Roll the call into the cross-provider totals.
        summary.total_cost = (summary.total_cost or 0.0) + estimated_cost
        summary.total_calls = (summary.total_calls or 0) + 1
        summary.updated_at = datetime.utcnow()

        db_track.add(APIUsageLog(
            user_id=user_id,
            provider=APIProvider.AUDIO,
            endpoint=f"/audio-generation/{_AUDIO_PROVIDER}",
            method="POST",
            model_used=_AUDIO_MODEL,
            tokens_input=character_count,
            tokens_output=0,
            tokens_total=character_count,
            cost_input=0.0,
            cost_output=0.0,
            cost_total=estimated_cost,
            response_time=0.0,
            status_code=200,
            request_size=len(text.encode("utf-8")),
            response_size=len(audio_bytes),
            billing_period=current_period,
        ))

        # Plan details are only used for the log line below. Tolerate a
        # missing 'limits' key so display data cannot abort the tracking.
        limits = pricing.get_user_limits(user_id) or {}
        plan_limits = limits.get('limits') or {}
        plan_name = limits.get('plan_name', 'unknown')
        tier = limits.get('tier', 'unknown')
        audio_limit = plan_limits.get("audio_calls", 0)
        audio_limit_display = audio_limit if (audio_limit > 0 or tier != 'enterprise') else ''
        current_image_calls = getattr(summary, "stability_calls", 0) or 0
        image_limit = plan_limits.get("stability_calls", 0)
        current_image_edit_calls = getattr(summary, "image_edit_calls", 0) or 0
        image_edit_limit = plan_limits.get("image_edit_calls", 0)
        current_video_calls = getattr(summary, "video_calls", 0) or 0
        video_limit = plan_limits.get("video_calls", 0)

        db_track.commit()
        logger.info(f"[audio_gen] ✅ Successfully tracked usage: user {user_id} -> audio -> {new_calls} calls, ${estimated_cost:.4f}")

        # UNIFIED SUBSCRIPTION LOG - shows before/after state in one message.
        # flush=True already flushes stdout, so no separate flush call is needed.
        print(f"""
[SUBSCRIPTION] Audio Generation
├─ User: {user_id}
├─ Plan: {plan_name} ({tier})
├─ Provider: {_AUDIO_PROVIDER}
├─ Actual Provider: {_AUDIO_PROVIDER}
├─ Model: {_AUDIO_MODEL}
├─ Voice: {voice_id}
├─ Calls: {current_calls_before}{new_calls} / {audio_limit_display}
├─ Cost: ${current_cost_before:.4f} → ${new_cost:.4f}
├─ Characters: {character_count}
├─ Images: {current_image_calls} / {image_limit if image_limit > 0 else ''}
├─ Image Editing: {current_image_edit_calls} / {image_edit_limit if image_edit_limit > 0 else ''}
├─ Videos: {current_video_calls} / {video_limit if video_limit > 0 else ''}
└─ Status: ✅ Allowed & Tracked
""", flush=True)
    except Exception as track_error:
        logger.error(f"[audio_gen] ❌ Error tracking usage (non-blocking): {track_error}", exc_info=True)
        db_track.rollback()
    finally:
        db_track.close()


def generate_audio(
    text: str,
    voice_id: str = "Wise_Woman",
    speed: float = 1.0,
    volume: float = 1.0,
    pitch: float = 0.0,
    emotion: str = "happy",
    user_id: Optional[str] = None,
    **kwargs
) -> AudioGenerationResult:
    """
    Generate audio using AI text-to-speech with subscription tracking.

    Flow: check the subscription limit, call WaveSpeed, then record usage.
    Usage recording is best-effort and never fails the request.

    Args:
        text: Text to convert to speech. The earlier docs state a maximum of
            10000 characters; this function does not enforce it.
        voice_id: Voice ID (default: "Wise_Woman")
        speed: Speech speed (documented range 0.5-2.0, default: 1.0)
        volume: Speech volume (documented range 0.1-10.0, default: 1.0)
        pitch: Speech pitch (documented range -12 to 12, default: 0.0)
        emotion: Emotion (default: "happy")
        user_id: User ID for subscription checking (required)
        **kwargs: Extra parameters forwarded unchanged to
            ``WaveSpeedClient.generate_speech``.

    Returns:
        AudioGenerationResult: Generated audio result

    Raises:
        RuntimeError: If user_id is missing or the subscription check itself fails.
        HTTPException: 429 when the limit is exceeded, 502 when the provider
            call fails, 500 for any other unexpected error.
    """
    try:
        logger.info("[audio_gen] Starting audio generation")
        logger.debug(f"[audio_gen] Text length: {len(text)} characters, voice: {voice_id}")

        # SUBSCRIPTION CHECK - required and strictly enforced.
        if not user_id:
            raise RuntimeError("user_id is required for subscription checking. Please provide Clerk user ID.")

        # Cost is billed per input character.
        character_count = len(text)
        estimated_cost = (character_count / 1000.0) * _AUDIO_COST_PER_1000_CHARS

        try:
            _enforce_audio_limits(user_id, character_count)
        except (HTTPException, RuntimeError):
            raise
        except Exception as sub_error:
            logger.error(f"[audio_gen] Subscription check failed for user {user_id}: {sub_error}")
            raise RuntimeError(f"Subscription check failed: {str(sub_error)}") from sub_error

        # Generate audio using WaveSpeed.
        try:
            client = WaveSpeedClient()
            audio_bytes = client.generate_speech(
                text=text,
                voice_id=voice_id,
                speed=speed,
                volume=volume,
                pitch=pitch,
                emotion=emotion,
                enable_sync_mode=True,
                **kwargs
            )
            logger.info(f"[audio_gen] ✅ API call successful, generated {len(audio_bytes)} bytes")
        except HTTPException:
            raise
        except Exception as api_error:
            logger.error(f"[audio_gen] Audio generation API failed: {api_error}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "Audio generation failed",
                    "message": str(api_error)
                }
            ) from api_error

        # TRACK USAGE after a successful API call.
        if audio_bytes:
            logger.info(f"[audio_gen] ✅ API call successful, tracking usage for user {user_id}")
            try:
                _track_audio_usage(
                    user_id=user_id,
                    voice_id=voice_id,
                    text=text,
                    audio_bytes=audio_bytes,
                    character_count=character_count,
                    estimated_cost=estimated_cost,
                )
            except Exception as usage_error:
                logger.error(f"[audio_gen] ❌ Failed to track usage: {usage_error}", exc_info=True)

        return AudioGenerationResult(
            audio_bytes=audio_bytes,
            provider=_AUDIO_PROVIDER,
            model=_AUDIO_MODEL,
            voice_id=voice_id,
            text_length=character_count,
            file_size=len(audio_bytes),
        )
    except HTTPException:
        raise
    except RuntimeError:
        raise
    except Exception as e:
        logger.error(f"[audio_gen] Error generating audio: {e}")
        raise HTTPException(
            status_code=500,
            detail={
                "error": "Audio generation failed",
                "message": str(e)
            }
        )

View File

@@ -515,6 +515,12 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct:
current_video_calls = getattr(summary, "video_calls", 0) or 0
video_limit = limits['limits'].get("video_calls", 0) if limits else 0
# Get audio stats for unified log
current_audio_calls = getattr(summary, "audio_calls", 0) or 0
audio_limit = limits['limits'].get("audio_calls", 0) if limits else 0
# Only show ∞ for Enterprise tier when limit is 0 (unlimited)
audio_limit_display = audio_limit if (audio_limit > 0 or tier != 'enterprise') else ''
# CRITICAL DEBUG: Print diagnostic info BEFORE commit (always visible, flushed immediately)
import sys
debug_msg = f"[DEBUG] BEFORE COMMIT - Record count: {record_count}, Raw SQL values: calls={current_calls_before}, tokens={current_tokens_before}, Provider: {provider_name}, Period: {current_period}, New calls will be: {new_calls}, New tokens will be: {new_tokens}"
@@ -571,6 +577,8 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct:
├─ Tokens: {current_tokens_before}{new_tokens} / {token_limit if token_limit > 0 else ''}
├─ Images: {current_images_before} / {image_limit if image_limit > 0 else ''}
├─ Image Editing: {current_image_edit_calls} / {image_edit_limit if image_edit_limit > 0 else ''}
├─ Videos: {current_video_calls} / {video_limit if video_limit > 0 else ''}
├─ Audio: {current_audio_calls} / {audio_limit_display}
└─ Status: ✅ Allowed & Tracked
""")
except Exception as track_error:
@@ -819,6 +827,12 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct:
current_video_calls = getattr(summary, "video_calls", 0) or 0
video_limit = limits['limits'].get("video_calls", 0) if limits else 0
# Get audio stats for unified log
current_audio_calls = getattr(summary, "audio_calls", 0) or 0
audio_limit = limits['limits'].get("audio_calls", 0) if limits else 0
# Only show ∞ for Enterprise tier when limit is 0 (unlimited)
audio_limit_display = audio_limit if (audio_limit > 0 or tier != 'enterprise') else ''
# CRITICAL: Flush before commit to ensure changes are immediately visible to other sessions
db_track.flush() # Flush to ensure changes are in DB (not just in transaction)
db_track.commit() # Commit transaction to make changes visible to other sessions
@@ -838,6 +852,7 @@ def llm_text_gen(prompt: str, system_prompt: Optional[str] = None, json_struct:
├─ Images: {current_images_before} / {image_limit if image_limit > 0 else ''}
├─ Image Editing: {current_image_edit_calls} / {image_edit_limit if image_edit_limit > 0 else ''}
├─ Videos: {current_video_calls} / {video_limit if video_limit > 0 else ''}
├─ Audio: {current_audio_calls} / {audio_limit_display}
└─ Status: ✅ Allowed & Tracked
""")
except Exception as track_error:

View File

@@ -10,6 +10,7 @@ from __future__ import annotations
import os
import base64
import io
import sys
from typing import Any, Dict, Optional, Union
from fastapi import HTTPException
@@ -22,11 +23,11 @@ except ImportError:
InferenceClient = None
from ..onboarding.api_key_manager import APIKeyManager
from services.subscription import PricingService
from utils.logger_utils import get_service_logger
logger = get_service_logger("video_generation_service")
class VideoProviderNotImplemented(Exception):
    """Error type for video providers that have no implementation.

    NOTE(review): meaning inferred from the class name; the raise sites are
    outside this view - confirm against the provider dispatch code.
    """
@@ -48,44 +49,80 @@ def _get_api_key(provider: str) -> Optional[str]:
def _read_video_payload(data: Any) -> Optional[bytes]:
    """Return *data* as bytes if it is bytes-like or file-like, otherwise None."""
    if isinstance(data, (bytes, bytearray, memoryview)):
        return bytes(data)
    if hasattr(data, "read"):
        return bytes(data.read())
    return None


def _coerce_video_bytes(output: Any) -> bytes:
    """
    Normalizes the different return shapes that huggingface_hub may emit for video tasks.

    According to HF docs, text_to_video() should return bytes directly. The
    other shapes are handled defensively, in this order:
    - raw bytes / bytearray / memoryview
    - a file-like object exposing ``read()``
    - an object with a ``.video`` or ``.bytes`` attribute
    - a dict holding the payload under ``video`` or another common key
    - a base64 string or ``data:`` URI

    Raises:
        TypeError: If no supported payload can be extracted from ``output``.
    """
    logger.debug(f"[video_gen] _coerce_video_bytes received type: {type(output)}")

    # Most common case: bytes directly
    if isinstance(output, (bytes, bytearray, memoryview)):
        logger.debug(f"[video_gen] Output is bytes: {len(output)} bytes")
        return bytes(output)

    # Handle file-like objects
    if hasattr(output, "read"):
        logger.debug("[video_gen] Output has read() method, reading...")
        data = output.read()
        if isinstance(data, (bytes, bytearray, memoryview)):
            return bytes(data)
        raise TypeError(f"File-like object returned non-bytes: {type(data)}")

    # Objects with direct attribute access; fall through when the attribute
    # exists but does not hold a usable payload.
    for attr in ("video", "bytes"):
        if hasattr(output, attr):
            logger.debug(f"[video_gen] Output has '{attr}' attribute")
            payload = _read_video_payload(getattr(output, attr))
            if payload is not None:
                return payload

    # Dict handling - but this shouldn't happen with text_to_video()
    if isinstance(output, dict):
        logger.warning(f"[video_gen] Received dict output (unexpected): keys={list(output.keys())}")
        # .get() is used so a missing key never raises KeyError.
        for key in ("video", "data", "content", "file", "result", "output"):
            payload = _read_video_payload(output.get(key))
            if payload is not None:
                return payload
        raise TypeError(f"Dict output has no recognized video key. Keys: {list(output.keys())}")

    # String handling (base64, optionally wrapped in a data URI)
    if isinstance(output, str):
        logger.debug("[video_gen] Output is string, attempting base64 decode")
        try:
            if output.startswith("data:"):
                _, encoded = output.split(",", 1)
                return base64.b64decode(encoded)
            return base64.b64decode(output)
        except (ValueError, TypeError) as exc:
            # Covers malformed base64 and a data URI with no comma separator.
            raise TypeError(f"Unable to decode string video payload: {exc}") from exc

    # Fallback: try to use output directly
    logger.warning(f"[video_gen] Unexpected output type: {type(output)}, attempting direct conversion")
    try:
        if hasattr(output, "__bytes__"):
            return bytes(output)
    except Exception:
        # Deliberate: a failing __bytes__ is reported by the TypeError below.
        pass

    raise TypeError(f"Unsupported video payload type: {type(output)}. Output: {str(output)[:200]}")
def _generate_with_huggingface(
@@ -96,7 +133,6 @@ def _generate_with_huggingface(
negative_prompt: Optional[str] = None,
seed: Optional[int] = None,
model: str = "tencent/HunyuanVideo",
input_image_bytes: Optional[bytes] = None,
) -> bytes:
"""
Generates video bytes using Hugging Face's InferenceClient.
@@ -109,7 +145,6 @@ def _generate_with_huggingface(
raise RuntimeError("HF token not configured. Set an hf_token in APIKeyManager.")
client = InferenceClient(
model=model,
provider="fal-ai",
token=token,
)
@@ -126,26 +161,25 @@ def _generate_with_huggingface(
params["seed"] = seed
logger.info(
"[video_gen] HuggingFace request model=%s frames=%s steps=%s mode=%s",
"[video_gen] HuggingFace request model=%s frames=%s steps=%s mode=text-to-video",
model,
num_frames,
num_inference_steps,
"image-to-video" if input_image_bytes else "text-to-video",
)
try:
call_kwargs = {**params, "model": model}
if input_image_bytes:
video_output = client.image_to_video(
image=input_image_bytes,
prompt=prompt,
**call_kwargs,
)
else:
video_output = client.text_to_video(
prompt,
**call_kwargs,
)
logger.info("[video_gen] Calling client.text_to_video()...")
video_output = client.text_to_video(
prompt=prompt,
model=model,
**params,
)
logger.info(f"[video_gen] text_to_video() returned type: {type(video_output)}")
if isinstance(video_output, dict):
logger.info(f"[video_gen] Dict keys: {list(video_output.keys())}")
elif hasattr(video_output, "__dict__"):
logger.info(f"[video_gen] Object attributes: {dir(video_output)}")
video_bytes = _coerce_video_bytes(video_output)
@@ -158,6 +192,15 @@ def _generate_with_huggingface(
logger.info(f"[video_gen] Successfully generated video: {len(video_bytes)} bytes")
return video_bytes
except KeyError as e:
error_msg = str(e)
logger.error(f"[video_gen] HF KeyError: {error_msg}", exc_info=True)
logger.error(f"[video_gen] This suggests the API response format is unexpected. Check logs above for response type.")
raise HTTPException(status_code=502, detail={
"error": f"Hugging Face API returned unexpected response format: {error_msg}",
"error_type": "KeyError",
"hint": "The API response may have changed. Check server logs for details."
})
except Exception as e:
error_msg = str(e)
error_type = type(e).__name__
@@ -179,7 +222,6 @@ def ai_video_generate(
prompt: str,
provider: str = "huggingface",
user_id: Optional[str] = None,
input_image_bytes: Optional[bytes] = None,
**kwargs,
) -> bytes:
"""
@@ -187,7 +229,6 @@ def ai_video_generate(
- provider: 'huggingface' (default), 'gemini' (veo3 stub), 'openai' (sora stub)
- kwargs: num_frames, guidance_scale, num_inference_steps, negative_prompt, seed, model
- input_image_bytes: optional bytes for image-to-video flows (uses image as motion anchor)
Returns raw video bytes (mp4/webm depending on provider).
"""
@@ -200,7 +241,6 @@ def ai_video_generate(
# PRE-FLIGHT VALIDATION: Validate video generation before API call
# MUST happen BEFORE any API calls - return immediately if validation fails
from services.database import get_db
from services.subscription import PricingService
from services.subscription.preflight_validator import validate_video_generation_operations
from fastapi import HTTPException
@@ -227,7 +267,6 @@ def ai_video_generate(
if provider == "huggingface":
video_bytes = _generate_with_huggingface(
prompt=prompt,
input_image_bytes=input_image_bytes,
**kwargs,
)
elif provider == "gemini":
@@ -237,112 +276,14 @@ def ai_video_generate(
else:
raise RuntimeError(f"Unknown video provider: {provider}")
# Track usage AFTER successful generation
db_track = next(get_db())
try:
from models.subscription_models import APIProvider, UsageSummary, APIUsageLog
from datetime import datetime
from services.subscription import PricingService
# Create pricing service for tracking (uses same DB session)
pricing_service_track = PricingService(db_track)
# Get current billing period
current_period = pricing_service_track.get_current_billing_period(user_id) or datetime.now().strftime("%Y-%m")
# Get or create usage summary
usage_summary = db_track.query(UsageSummary).filter(
UsageSummary.user_id == user_id,
UsageSummary.billing_period == current_period
).first()
if not usage_summary:
usage_summary = UsageSummary(
user_id=user_id,
billing_period=current_period
)
db_track.add(usage_summary)
db_track.commit()
# Calculate cost using pricing service
cost_info = pricing_service_track.get_pricing_for_provider_model(
APIProvider.VIDEO,
model_name
)
cost_per_video = cost_info.get('cost_per_request', 0.10) if cost_info else 0.10
# Get "before" state for unified log
current_video_calls_before = getattr(usage_summary, 'video_calls', 0) or 0
current_video_cost = getattr(usage_summary, 'video_cost', 0.0) or 0.0
# Increment video_calls and track cost
new_video_calls = current_video_calls_before + 1
usage_summary.video_calls = new_video_calls
usage_summary.video_cost = current_video_cost + cost_per_video
usage_summary.total_calls = (usage_summary.total_calls or 0) + 1
usage_summary.total_cost = (usage_summary.total_cost or 0.0) + cost_per_video
# Get plan details for unified log (before commit, in case commit fails)
limits = pricing_service_track.get_user_limits(user_id)
plan_name = limits.get('plan_name', 'unknown') if limits else 'unknown'
tier = limits.get('tier', 'unknown') if limits else 'unknown'
video_limit = limits['limits'].get("video_calls", 0) if limits else 0
# Get image and image editing stats for unified log
current_image_calls = getattr(usage_summary, "stability_calls", 0) or 0
image_limit = limits['limits'].get("stability_calls", 0) if limits else 0
current_image_edit_calls = getattr(usage_summary, "image_edit_calls", 0) or 0
image_edit_limit = limits['limits'].get("image_edit_calls", 0) if limits else 0
# Create usage log entry for audit trail
usage_log = APIUsageLog(
user_id=user_id,
provider=APIProvider.VIDEO,
endpoint=f"/video-generation/{provider}",
method="POST",
model_used=model_name,
tokens_input=0,
tokens_output=0,
tokens_total=0,
cost_input=0.0,
cost_output=0.0,
cost_total=cost_per_video,
response_time=0.0, # Could track actual time if needed
status_code=200,
request_size=len(prompt.encode('utf-8')),
response_size=len(video_bytes),
billing_period=current_period
)
db_track.add(usage_log)
db_track.commit()
logger.info(f"[video_gen] ✅ Successfully tracked usage: user {user_id} -> 1 video call, ${cost_per_video:.4f} cost")
# UNIFIED SUBSCRIPTION LOG - Shows before/after state in one message
# Flush immediately to ensure it's visible in console/logs
import sys
log_message = f"""
[SUBSCRIPTION] Video Generation
├─ User: {user_id}
├─ Plan: {plan_name} ({tier})
├─ Provider: video
├─ Actual Provider: {provider}
├─ Model: {model_name or 'default'}
├─ Calls: {current_video_calls_before}{new_video_calls} / {video_limit if video_limit > 0 else ''}
├─ Images: {current_image_calls} / {image_limit if image_limit > 0 else ''}
├─ Image Editing: {current_image_edit_calls} / {image_edit_limit if image_edit_limit > 0 else ''}
└─ Status: ✅ Allowed & Tracked
"""
print(log_message, flush=True)
sys.stdout.flush()
except Exception as track_error:
logger.error(f"[video_gen] Error tracking usage: {track_error}", exc_info=True)
db_track.rollback()
# Don't fail video generation if tracking fails - video is already generated
finally:
db_track.close()
track_video_usage(
user_id=user_id,
provider=provider,
model_name=model_name,
prompt=prompt,
video_bytes=video_bytes,
)
return video_bytes
except HTTPException:
@@ -353,3 +294,139 @@ def ai_video_generate(
raise HTTPException(status_code=500, detail={"error": str(e)})
def track_video_usage(
    *,
    user_id: str,
    provider: str,
    model_name: str,
    prompt: str,
    video_bytes: bytes,
    cost_override: Optional[float] = None,
) -> Optional[Dict[str, Any]]:
    """
    Track subscription usage for any video generation (text-to-video or image-to-video).

    Increments the video counters on the user's UsageSummary for the current
    billing period, writes an APIUsageLog row, and logs a summary message.

    Args:
        user_id: User whose usage is recorded.
        provider: Provider name; used in the logged endpoint path.
        model_name: Model used; also used to look up per-request pricing.
        prompt: Prompt text; only its UTF-8 byte length is recorded.
        video_bytes: Generated video; only its length is recorded.
        cost_override: If not None, used instead of the looked-up cost.

    Returns:
        A dict with ``previous_calls``, ``current_calls``, ``video_limit``,
        ``video_limit_display``, ``cost_per_video`` and ``total_video_cost``.
        Returns None when tracking fails: the error is logged and the
        transaction rolled back, and nothing is raised.
    """
    from datetime import datetime
    from models.subscription_models import APIProvider, APIUsageLog, UsageSummary
    from services.database import get_db

    db_track = next(get_db())
    try:
        logger.info(f"[video_gen] Starting usage tracking for user={user_id}, provider={provider}, model={model_name}")
        pricing_service_track = PricingService(db_track)
        current_period = pricing_service_track.get_current_billing_period(user_id) or datetime.now().strftime("%Y-%m")
        logger.debug(f"[video_gen] Billing period: {current_period}")

        usage_summary = (
            db_track.query(UsageSummary)
            .filter(
                UsageSummary.user_id == user_id,
                UsageSummary.billing_period == current_period,
            )
            .first()
        )
        if not usage_summary:
            logger.debug(f"[video_gen] Creating new UsageSummary for user={user_id}, period={current_period}")
            usage_summary = UsageSummary(
                user_id=user_id,
                billing_period=current_period,
            )
            db_track.add(usage_summary)
            db_track.commit()
            db_track.refresh(usage_summary)
        else:
            logger.debug(f"[video_gen] Found existing UsageSummary: video_calls={getattr(usage_summary, 'video_calls', 0)}")

        # Cost precedence: explicit override, then configured price, then 0.10.
        cost_info = pricing_service_track.get_pricing_for_provider_model(
            APIProvider.VIDEO,
            model_name,
        )
        default_cost = 0.10
        if cost_info and cost_info.get("cost_per_request") is not None:
            default_cost = cost_info["cost_per_request"]
        cost_per_video = cost_override if cost_override is not None else default_cost
        logger.debug(f"[video_gen] Cost per video: ${cost_per_video} (override={cost_override}, default={default_cost})")

        current_video_calls_before = getattr(usage_summary, "video_calls", 0) or 0
        current_video_cost = getattr(usage_summary, "video_cost", 0.0) or 0.0
        usage_summary.video_calls = current_video_calls_before + 1
        usage_summary.video_cost = current_video_cost + cost_per_video
        usage_summary.total_calls = (usage_summary.total_calls or 0) + 1
        usage_summary.total_cost = (usage_summary.total_cost or 0.0) + cost_per_video
        # Ensure the object is in the session
        db_track.add(usage_summary)
        logger.debug(f"[video_gen] Updated usage_summary: video_calls={current_video_calls_before}{usage_summary.video_calls}")

        # Plan details feed only the log message and return value. Tolerate a
        # missing "limits" key so display data cannot roll back the increment.
        limits = pricing_service_track.get_user_limits(user_id) or {}
        plan_limits = limits.get("limits") or {}
        plan_name = limits.get("plan_name", "unknown")
        tier = limits.get("tier", "unknown")
        video_limit = plan_limits.get("video_calls", 0)
        current_image_calls = getattr(usage_summary, "stability_calls", 0) or 0
        image_limit = plan_limits.get("stability_calls", 0)
        current_image_edit_calls = getattr(usage_summary, "image_edit_calls", 0) or 0
        image_edit_limit = plan_limits.get("image_edit_calls", 0)
        current_audio_calls = getattr(usage_summary, "audio_calls", 0) or 0
        audio_limit = plan_limits.get("audio_calls", 0)
        audio_limit_display = audio_limit if (audio_limit > 0 or tier != 'enterprise') else ''

        usage_log = APIUsageLog(
            user_id=user_id,
            provider=APIProvider.VIDEO,
            endpoint=f"/video-generation/{provider}",
            method="POST",
            model_used=model_name,
            tokens_input=0,
            tokens_output=0,
            tokens_total=0,
            cost_input=0.0,
            cost_output=0.0,
            cost_total=cost_per_video,
            response_time=0.0,
            status_code=200,
            request_size=len(prompt.encode("utf-8")),
            response_size=len(video_bytes),
            billing_period=current_period,
        )
        db_track.add(usage_log)

        logger.debug("[video_gen] Flushing changes before commit...")
        db_track.flush()
        logger.debug("[video_gen] Committing usage tracking changes...")
        db_track.commit()
        db_track.refresh(usage_summary)
        logger.debug(f"[video_gen] Commit successful. Final video_calls: {usage_summary.video_calls}, video_cost: {usage_summary.video_cost}")

        video_limit_display = video_limit if video_limit > 0 else ''
        log_message = f"""
[SUBSCRIPTION] Video Generation
├─ User: {user_id}
├─ Plan: {plan_name} ({tier})
├─ Provider: video
├─ Actual Provider: {provider}
├─ Model: {model_name or 'default'}
├─ Calls: {current_video_calls_before}{usage_summary.video_calls} / {video_limit_display}
├─ Images: {current_image_calls} / {image_limit if image_limit > 0 else ''}
├─ Image Editing: {current_image_edit_calls} / {image_edit_limit if image_edit_limit > 0 else ''}
├─ Audio: {current_audio_calls} / {audio_limit_display}
└─ Status: ✅ Allowed & Tracked
"""
        logger.info(log_message)

        return {
            "previous_calls": current_video_calls_before,
            "current_calls": usage_summary.video_calls,
            "video_limit": video_limit,
            "video_limit_display": video_limit_display,
            "cost_per_video": cost_per_video,
            "total_video_cost": usage_summary.video_cost,
        }
    except Exception as track_error:
        # Best-effort: the video already exists, so a tracking failure must
        # not propagate. The traceback is attached to the first record only.
        logger.error(f"[video_gen] Error tracking usage: {track_error}", exc_info=True)
        logger.error(f"[video_gen] Exception type: {type(track_error).__name__}")
        db_track.rollback()
        return None
    finally:
        db_track.close()

View File

@@ -414,7 +414,8 @@ class APIKeyManager:
'SERPER_API_KEY',
'METAPHOR_API_KEY',
'FIRECRAWL_API_KEY',
'STABILITY_API_KEY'
'STABILITY_API_KEY',
'WAVESPEED_API_KEY',
]
for provider in providers:

View File

@@ -288,4 +288,90 @@ class StoryAudioGenerationService:
logger.info(f"[StoryAudioGeneration] Generated {len(audio_results)} audio files out of {total_scenes} scenes")
return audio_results
def generate_ai_audio(
    self,
    scene_number: int,
    scene_title: str,
    text: str,
    user_id: str,
    voice_id: str = "Wise_Woman",
    speed: float = 1.0,
    volume: float = 1.0,
    pitch: float = 0.0,
    emotion: str = "happy",
) -> Dict[str, Any]:
    """
    Synthesize narration for one scene and store it under the output directory.

    The stripped text is passed to ``generate_audio`` from
    ``services.llm_providers.main_audio_generation``; the returned bytes are
    written to ``self.output_dir`` under a name from
    ``self._generate_audio_filename``.

    Parameters:
        scene_number (int): Scene number.
        scene_title (str): Scene title.
        text (str): Text to convert to speech; must contain non-whitespace.
        user_id (str): User ID forwarded to ``generate_audio``.
        voice_id (str): Voice ID (default: "Wise_Woman").
        speed (float): Speech speed (default: 1.0).
        volume (float): Speech volume (default: 1.0).
        pitch (float): Speech pitch (default: 0.0).
        emotion (str): Emotion for speech (default: "happy").

    Returns:
        Dict[str, Any]: Scene info, file path/name/URL, provider details,
        sizes and the computed cost.

    Raises:
        ValueError: If ``text`` is empty or whitespace only.
        HTTPException: Propagated unchanged from ``generate_audio``.
        RuntimeError: Wraps any other failure.
    """
    narration = text.strip() if text else ""
    if not narration:
        raise ValueError(f"Scene {scene_number} ({scene_title}) requires non-empty text")

    try:
        logger.info(f"[StoryAudioGeneration] Generating AI audio for scene {scene_number}: {scene_title}")
        logger.debug(f"[StoryAudioGeneration] Text length: {len(text)} characters, voice: {voice_id}")

        # Imported at call time, as in the original implementation.
        from services.llm_providers.main_audio_generation import generate_audio

        speech = generate_audio(
            text=narration,
            voice_id=voice_id,
            speed=speed,
            volume=volume,
            pitch=pitch,
            emotion=emotion,
            user_id=user_id,
        )

        # Persist the clip to disk.
        filename = self._generate_audio_filename(scene_number, scene_title)
        destination = self.output_dir / filename
        with open(destination, "wb") as handle:
            handle.write(speech.audio_bytes)
        logger.info(f"[StoryAudioGeneration] Saved AI audio to: {destination} ({speech.file_size} bytes)")

        # Cost reported to the caller: 0.05 per 1,000 characters.
        chars = speech.text_length
        cost = (chars / 1000.0) * 0.05

        metadata: Dict[str, Any] = {
            "scene_number": scene_number,
            "scene_title": scene_title,
            "audio_path": str(destination),
            "audio_filename": filename,
            "audio_url": f"/api/story/audio/{filename}",
            "provider": speech.provider,
            "model": speech.model,
            "voice_id": speech.voice_id,
            "text_length": speech.text_length,
            "file_size": speech.file_size,
            "cost": cost,
        }
        return metadata
    except HTTPException:
        # HTTP errors (e.g. 429 subscription limit) pass through untouched.
        raise
    except Exception as e:
        logger.error(f"[StoryAudioGeneration] Error generating AI audio for scene {scene_number}: {e}")
        raise RuntimeError(f"Failed to generate AI audio for scene {scene_number}: {str(e)}") from e

View File

@@ -193,4 +193,82 @@ class StoryImageGenerationService:
logger.info(f"[StoryImageGeneration] Generated {len(image_results)} images out of {total_scenes} scenes")
return image_results
def regenerate_scene_image(
    self,
    scene_number: int,
    scene_title: str,
    prompt: str,
    user_id: str,
    provider: Optional[str] = None,
    width: int = 1024,
    height: int = 1024,
    model: Optional[str] = None
) -> Dict[str, Any]:
    """
    Re-create the image for one scene from a caller-supplied prompt.

    The stripped prompt goes straight to ``generate_image``; no prompt is
    generated here. The resulting bytes are written to ``self.output_dir``
    under a name from ``self._generate_image_filename``.

    Parameters:
        scene_number (int): Scene number.
        scene_title (str): Scene title.
        prompt (str): Direct prompt; must contain non-whitespace.
        user_id (str): User ID forwarded to ``generate_image``.
        provider (str, optional): Image generation provider.
        width (int): Image width (default: 1024).
        height (int): Image height (default: 1024).
        model (str, optional): Model to use for image generation.

    Returns:
        Dict[str, Any]: Scene info, file path/name/URL and the generation
        details reported by the result object.

    Raises:
        ValueError: If ``prompt`` is empty or whitespace only.
        HTTPException: Propagated unchanged from ``generate_image``.
        RuntimeError: Wraps any other failure.
    """
    cleaned_prompt = prompt.strip() if prompt else ""
    if not cleaned_prompt:
        raise ValueError(f"Scene {scene_number} ({scene_title}) requires a non-empty prompt")

    try:
        logger.info(f"[StoryImageGeneration] Regenerating image for scene {scene_number}: {scene_title}")
        logger.debug(f"[StoryImageGeneration] Using direct prompt: {prompt[:100]}...")

        # Hand the prompt straight to the shared image-generation service.
        generated: ImageGenerationResult = generate_image(
            prompt=cleaned_prompt,
            options={
                "provider": provider,
                "width": width,
                "height": height,
                "model": model,
            },
            user_id=user_id
        )

        # Persist the image to disk.
        filename = self._generate_image_filename(scene_number, scene_title)
        destination = self.output_dir / filename
        with open(destination, "wb") as handle:
            handle.write(generated.image_bytes)
        logger.info(f"[StoryImageGeneration] Saved regenerated image to: {destination}")

        metadata: Dict[str, Any] = {
            "scene_number": scene_number,
            "scene_title": scene_title,
            "image_path": str(destination),
            "image_filename": filename,
            "image_url": f"/api/story/images/{filename}",
            "width": generated.width,
            "height": generated.height,
            "provider": generated.provider,
            "model": generated.model,
            "seed": generated.seed,
        }
        return metadata
    except HTTPException:
        # HTTP errors (e.g. 429 subscription limit) pass through untouched.
        raise
    except Exception as e:
        logger.error(f"[StoryImageGeneration] Error regenerating image for scene {scene_number}: {e}")
        raise RuntimeError(f"Failed to regenerate image for scene {scene_number}: {str(e)}") from e

View File

@@ -220,35 +220,41 @@ class StoryVideoGenerationService:
def generate_story_video(
self,
scenes: List[Dict[str, Any]],
image_paths: List[str],
image_paths: List[Optional[str]],
audio_paths: List[str],
user_id: str,
story_title: str = "Story",
fps: int = 24,
transition_duration: float = 0.5,
progress_callback: Optional[callable] = None
progress_callback: Optional[callable] = None,
video_paths: Optional[List[Optional[str]]] = None
) -> Dict[str, Any]:
"""
Generate a complete story video from multiple scenes.
Parameters:
scenes (List[Dict[str, Any]]): List of scene data.
image_paths (List[str]): List of image file paths for each scene.
image_paths (List[Optional[str]]): List of image file paths (None if scene has animated video).
audio_paths (List[str]): List of audio file paths for each scene.
user_id (str): Clerk user ID for subscription checking.
story_title (str): Title of the story (default: "Story").
fps (int): Frames per second for video (default: 24).
transition_duration (float): Duration of transitions between scenes in seconds (default: 0.5).
progress_callback (callable, optional): Callback function for progress updates.
video_paths (Optional[List[Optional[str]]]): List of animated video file paths (None if scene has static image).
Returns:
Dict[str, Any]: Video metadata including file path, URL, and story info.
"""
if not scenes or not image_paths or not audio_paths:
raise ValueError("Scenes, image paths, and audio paths are required")
if not scenes or not audio_paths:
raise ValueError("Scenes and audio paths are required")
if len(scenes) != len(image_paths) or len(scenes) != len(audio_paths):
raise ValueError("Number of scenes, image paths, and audio paths must match")
if len(scenes) != len(audio_paths):
raise ValueError("Number of scenes and audio paths must match")
video_paths = video_paths or [None] * len(scenes)
if len(video_paths) != len(scenes):
video_paths = video_paths + [None] * (len(scenes) - len(video_paths))
try:
logger.info(f"[StoryVideoGeneration] Generating story video for {len(scenes)} scenes")
@@ -293,36 +299,59 @@ class StoryVideoGenerationService:
scene_clips = []
total_duration = 0.0
for idx, (scene, image_path, audio_path) in enumerate(zip(scenes, image_paths, audio_paths)):
# Import VideoFileClip for animated videos
try:
from moviepy import VideoFileClip
except ImportError:
VideoFileClip = None
for idx, (scene, image_path, audio_path, video_path) in enumerate(zip(scenes, image_paths, audio_paths, video_paths)):
try:
scene_number = scene.get("scene_number", idx + 1)
scene_title = scene.get("title", "Untitled")
logger.info(f"[StoryVideoGeneration] Processing scene {scene_number}/{len(scenes)}: {scene_title}")
# Load image and audio
image_file = Path(image_path)
audio_file = Path(audio_path)
if not image_file.exists():
logger.warning(f"[StoryVideoGeneration] Image not found: {image_path}, skipping scene {scene_number}")
continue
if not audio_file.exists():
logger.warning(f"[StoryVideoGeneration] Audio not found: {audio_path}, skipping scene {scene_number}")
continue
# Load audio to get duration
# Load audio
audio_clip = AudioFileClip(str(audio_file))
audio_duration = audio_clip.duration
# Create image clip (MoviePy v2: use with_* API)
image_clip = ImageClip(str(image_file)).with_duration(audio_duration)
image_clip = image_clip.with_fps(fps)
# Prefer animated video if available
if video_path and Path(video_path).exists():
logger.info(f"[StoryVideoGeneration] Using animated video for scene {scene_number}: {video_path}")
# Load animated video
if VideoFileClip is None:
raise RuntimeError("VideoFileClip not available - MoviePy may not be fully installed")
video_clip = VideoFileClip(str(video_path))
# Replace audio with the preferred audio (AI or free)
video_clip = video_clip.with_audio(audio_clip)
# Match duration to audio if needed
if video_clip.duration > audio_duration:
video_clip = video_clip.subclip(0, audio_duration)
elif video_clip.duration < audio_duration:
# Loop the video if it's shorter than audio
loops_needed = int(audio_duration / video_clip.duration) + 1
video_clip = concatenate_videoclips([video_clip] * loops_needed).subclip(0, audio_duration)
video_clip = video_clip.with_audio(audio_clip)
elif image_path and Path(image_path).exists():
# Fall back to static image
logger.info(f"[StoryVideoGeneration] Using static image for scene {scene_number}: {image_path}")
image_file = Path(image_path)
# Create image clip (MoviePy v2: use with_* API)
image_clip = ImageClip(str(image_file)).with_duration(audio_duration)
image_clip = image_clip.with_fps(fps)
# Set audio to image clip
video_clip = image_clip.with_audio(audio_clip)
else:
logger.warning(f"[StoryVideoGeneration] No video or image found for scene {scene_number}, skipping")
continue
# Set audio to image clip
video_clip = image_clip.with_audio(audio_clip)
scene_clips.append(video_clip)
total_duration += audio_duration
# Call progress callback if provided

View File

@@ -19,10 +19,18 @@ import re
from models.api_monitoring import APIRequest, APIEndpointStats, SystemHealth, CachePerformance
from models.subscription_models import APIProvider
from services.database import get_db
from .usage_tracking_service import UsageTrackingService
from .pricing_service import PricingService
def _get_db_session():
    """
    Return a database session obtained from ``services.database.get_db``.

    The import happens inside the function on purpose: Uvicorn's reloader can
    sometimes clear module-level imports, so resolving ``get_db`` at call time
    keeps this helper usable across hot reloads.

    Returns:
        The first value yielded by ``get_db()``. Callers are responsible for
        closing it.
    """
    from services.database import get_db

    session_source = get_db()
    return next(session_source)
class DatabaseAPIMonitor:
"""Database-backed API monitoring with usage tracking and subscription management."""
@@ -145,8 +153,9 @@ async def check_usage_limits_middleware(request: Request, user_id: str, request_
except Exception:
pass
db = None
try:
db = next(get_db())
db = _get_db_session()
api_monitor = DatabaseAPIMonitor()
# Detect if this is an API call that should be rate limited
@@ -203,14 +212,15 @@ async def check_usage_limits_middleware(request: Request, user_id: str, request_
# Don't block requests if usage checking fails
return None
finally:
db.close()
if db is not None:
db.close()
async def monitoring_middleware(request: Request, call_next):
"""Enhanced FastAPI middleware for monitoring API calls with usage tracking."""
start_time = time.time()
# Get database session
db = next(get_db())
db = _get_db_session()
# Extract request details - Enhanced user identification
user_id = None
@@ -340,8 +350,9 @@ async def monitoring_middleware(request: Request, call_next):
async def get_monitoring_stats(minutes: int = 5) -> Dict[str, Any]:
"""Get current monitoring statistics."""
db = next(get_db())
db = None
try:
db = _get_db_session()
# Placeholder to match old API; heavy stats handled elsewhere
return {
'timestamp': datetime.utcnow().isoformat(),
@@ -354,12 +365,14 @@ async def get_monitoring_stats(minutes: int = 5) -> Dict[str, Any]:
'system_health': {'status': 'healthy', 'error_rate': 0.0}
}
finally:
db.close()
if db is not None:
db.close()
async def get_lightweight_stats() -> Dict[str, Any]:
"""Get lightweight stats for dashboard header."""
db = next(get_db())
db = None
try:
db = _get_db_session()
# Minimal viable placeholder values
now = datetime.utcnow()
return {
@@ -371,4 +384,5 @@ async def get_lightweight_stats() -> Dict[str, Any]:
'timestamp': now.isoformat()
}
finally:
db.close()
if db is not None:
db.close()

View File

@@ -420,3 +420,54 @@ def validate_video_generation_operations(
'message': f"Failed to validate video generation: {str(e)}"
}
)
def validate_scene_animation_operation(
    pricing_service: PricingService,
    user_id: str,
) -> None:
    """
    Pre-flight check for the per-scene animation workflow.

    Asks ``pricing_service.check_comprehensive_limits`` whether the user may
    perform one ``scene_animation`` video operation (provider ``wavespeed``)
    before any external API call is made.

    Parameters:
        pricing_service (PricingService): Service used to check usage limits.
        user_id (str): ID of the user requesting the animation.

    Raises:
        HTTPException: 429 when the limit check refuses the operation, 500 when
            the validation itself fails unexpectedly.
    """
    try:
        allowed, reason, details = pricing_service.check_comprehensive_limits(
            user_id=user_id,
            operations=[
                {
                    'provider': APIProvider.VIDEO,
                    'tokens_requested': 0,
                    'actual_provider_name': 'wavespeed',
                    'operation_type': 'scene_animation',
                }
            ],
        )

        if allowed:
            logger.info(f"[Pre-flight Validator] ✅ Scene animation validated for user {user_id}")
            return

        logger.error(f"[Pre-flight Validator] Scene animation blocked for user {user_id}: {reason}")

        usage_info = (details or {}).get('usage_info', {})
        if usage_info:
            provider = usage_info.get('provider', 'video')
        else:
            provider = 'video'

        raise HTTPException(
            status_code=429,
            detail={
                'error': reason,
                'message': reason,
                'provider': provider,
                # Fall back to the raw error details when no usage info exists.
                'usage_info': usage_info or details,
            }
        )
    except HTTPException:
        # Limit refusals (and any other HTTP errors) pass through untouched.
        raise
    except Exception as e:
        logger.error(f"[Pre-flight Validator] Error validating scene animation: {e}", exc_info=True)
        failure = f"Failed to validate scene animation: {str(e)}"
        raise HTTPException(
            status_code=500,
            detail={
                'error': failure,
                'message': failure,
            },
        )

View File

@@ -307,6 +307,41 @@ class PricingService:
"model_name": "default",
"cost_per_request": 0.10, # $0.10 per video generation (estimated)
"description": "AI Video Generation default pricing"
},
{
"provider": APIProvider.VIDEO,
"model_name": "kling-v2.5-turbo-std-5s",
"cost_per_request": 0.21,
"description": "WaveSpeed Kling v2.5 Turbo Std Image-to-Video (5 seconds)"
},
{
"provider": APIProvider.VIDEO,
"model_name": "kling-v2.5-turbo-std-10s",
"cost_per_request": 0.42,
"description": "WaveSpeed Kling v2.5 Turbo Std Image-to-Video (10 seconds)"
},
{
"provider": APIProvider.VIDEO,
"model_name": "wavespeed-ai/infinitetalk",
"cost_per_request": 0.30,
"description": "WaveSpeed InfiniteTalk (image + audio to talking avatar video)"
},
# Audio Generation Pricing (Minimax Speech 02 HD via WaveSpeed)
{
"provider": APIProvider.AUDIO,
"model_name": "minimax/speech-02-hd",
"cost_per_input_token": 0.00005, # $0.05 per 1,000 characters (every character is 1 token)
"cost_per_output_token": 0.0, # No output tokens for audio
"cost_per_request": 0.0, # Pricing is per character, not per request
"description": "AI Audio Generation (Text-to-Speech) - Minimax Speech 02 HD via WaveSpeed"
},
{
"provider": APIProvider.AUDIO,
"model_name": "default",
"cost_per_input_token": 0.00005, # $0.05 per 1,000 characters default
"cost_per_output_token": 0.0,
"cost_per_request": 0.0,
"description": "AI Audio Generation default pricing"
}
]
@@ -358,6 +393,7 @@ class PricingService:
"exa_calls_limit": 100,
"video_calls_limit": 0, # No video generation for free tier
"image_edit_calls_limit": 10, # 10 AI image editing calls/month
"audio_calls_limit": 20, # 20 AI audio generation calls/month
"gemini_tokens_limit": 100000,
"monthly_cost_limit": 0.0,
"features": ["basic_content_generation", "limited_research"],
@@ -381,6 +417,7 @@ class PricingService:
"exa_calls_limit": 500,
"video_calls_limit": 20, # 20 videos/month for basic plan
"image_edit_calls_limit": 30, # 30 AI image editing calls/month
"audio_calls_limit": 50, # 50 AI audio generation calls/month
"gemini_tokens_limit": 20000, # Increased from 5000 for better stability
"openai_tokens_limit": 20000, # Increased from 5000 for better stability
"anthropic_tokens_limit": 20000, # Increased from 5000 for better stability
@@ -406,6 +443,7 @@ class PricingService:
"exa_calls_limit": 2000,
"video_calls_limit": 50, # 50 videos/month for pro plan
"image_edit_calls_limit": 100, # 100 AI image editing calls/month
"audio_calls_limit": 200, # 200 AI audio generation calls/month
"gemini_tokens_limit": 5000000,
"openai_tokens_limit": 2500000,
"anthropic_tokens_limit": 1000000,
@@ -431,6 +469,7 @@ class PricingService:
"exa_calls_limit": 0, # Unlimited
"video_calls_limit": 0, # Unlimited for enterprise
"image_edit_calls_limit": 0, # Unlimited image editing for enterprise
"audio_calls_limit": 0, # Unlimited audio generation for enterprise
"gemini_tokens_limit": 0,
"openai_tokens_limit": 0,
"anthropic_tokens_limit": 0,
@@ -651,6 +690,7 @@ class PricingService:
'stability_calls': plan.stability_calls_limit,
'video_calls': getattr(plan, 'video_calls_limit', 0), # Support missing column
'image_edit_calls': getattr(plan, 'image_edit_calls_limit', 0), # Support missing column
'audio_calls': getattr(plan, 'audio_calls_limit', 0), # Support missing column
# Token limits
'gemini_tokens': plan.gemini_tokens_limit,
'openai_tokens': plan.openai_tokens_limit,

View File

@@ -31,6 +31,7 @@ def ensure_subscription_plan_columns(db: Session) -> None:
"exa_calls_limit": "INTEGER DEFAULT 0",
"video_calls_limit": "INTEGER DEFAULT 0",
"image_edit_calls_limit": "INTEGER DEFAULT 0",
"audio_calls_limit": "INTEGER DEFAULT 0",
}
for col_name, ddl in required_columns.items():
@@ -84,6 +85,8 @@ def ensure_usage_summaries_columns(db: Session) -> None:
"video_cost": "REAL DEFAULT 0.0",
"image_edit_calls": "INTEGER DEFAULT 0",
"image_edit_cost": "REAL DEFAULT 0.0",
"audio_calls": "INTEGER DEFAULT 0",
"audio_cost": "REAL DEFAULT 0.0",
}
for col_name, ddl in required_columns.items():

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,471 @@
from __future__ import annotations
import json
import time
from typing import Any, Dict, Optional
import requests
from fastapi import HTTPException
from requests import exceptions as requests_exceptions
from services.onboarding.api_key_manager import APIKeyManager
from utils.logger_utils import get_service_logger
logger = get_service_logger("wavespeed.client")
class WaveSpeedClient:
    """
    Thin HTTP client for the WaveSpeed AI API.

    Handles authentication, submission, and polling helpers, plus prompt
    optimization and text-to-speech. Transport failures (timeouts, connection
    errors), non-200 responses and malformed payloads are all reported as
    ``fastapi.HTTPException`` (504 for timeouts, 502 otherwise).
    """

    BASE_URL = "https://api.wavespeed.ai/api/v3"

    def __init__(self, api_key: Optional[str] = None):
        """
        Create a client.

        Args:
            api_key: Explicit API key. When omitted, the "wavespeed" key is
                looked up through ``APIKeyManager``.

        Raises:
            RuntimeError: If no API key is available from either source.
        """
        manager = APIKeyManager()
        self.api_key = api_key or manager.get_api_key("wavespeed")
        if not self.api_key:
            raise RuntimeError("WAVESPEED_API_KEY is not configured. Please add it to your environment.")

    def _headers(self) -> Dict[str, str]:
        """Return the JSON + bearer-token headers used for POST requests."""
        return {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

    @staticmethod
    def _send(
        send,
        url: str,
        *,
        action: str,
        timeout: float,
        extra_detail: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ):
        """
        Perform one HTTP call and translate transport errors to HTTPException.

        Args:
            send: The ``requests`` callable to use (``requests.get`` or
                ``requests.post``).
            url: Target URL.
            action: Human-readable name of the operation; used to build the
                error message ("<action> timed out" / "<action> failed").
            timeout: Request timeout in seconds.
            extra_detail: Extra keys merged into the error detail dict.
            **kwargs: Passed through to ``send`` (headers, json, ...).

        Returns:
            The ``requests`` response object (status code is NOT checked here).

        Raises:
            HTTPException: 504 on timeout, 502 on any other request error.
        """
        try:
            return send(url, timeout=timeout, **kwargs)
        except requests_exceptions.Timeout as exc:
            raise HTTPException(
                status_code=504,
                detail={
                    "error": f"{action} timed out",
                    **(extra_detail or {}),
                    "exception": str(exc),
                },
            ) from exc
        except requests_exceptions.RequestException as exc:
            raise HTTPException(
                status_code=502,
                detail={
                    "error": f"{action} failed",
                    **(extra_detail or {}),
                    "exception": str(exc),
                },
            ) from exc

    @staticmethod
    def _decode_json(response, *, action: str) -> Dict[str, Any]:
        """
        Decode a response body as a JSON object.

        Raises:
            HTTPException: 502 when the body is not valid JSON or is not a
                JSON object.
        """
        try:
            body = response.json()
        except ValueError as exc:
            logger.error(f"[WaveSpeed] {action} returned a non-JSON body: {response.text}")
            raise HTTPException(
                status_code=502,
                detail={"error": f"{action} returned an invalid JSON response"},
            ) from exc
        if not isinstance(body, dict):
            logger.error(f"[WaveSpeed] {action} returned unexpected JSON: {response.text}")
            raise HTTPException(
                status_code=502,
                detail={"error": f"{action} returned an invalid JSON response"},
            )
        return body

    def submit_image_to_video(
        self,
        model_path: str,
        payload: Dict[str, Any],
        timeout: int = 30,
    ) -> str:
        """
        Submit an image-to-video generation request.

        Args:
            model_path: Model path appended to ``BASE_URL``.
            payload: JSON payload for the model.
            timeout: Request timeout in seconds (default: 30).

        Returns:
            The prediction ID for polling.

        Raises:
            HTTPException: 504 on timeout; 502 on connection errors, a non-200
                response, or a response without a prediction id.
        """
        url = f"{self.BASE_URL}/{model_path}"
        logger.info(f"[WaveSpeed] Submitting request to {url}")
        response = self._send(
            requests.post,
            url,
            action="WaveSpeed image-to-video submission request",
            timeout=timeout,
            headers=self._headers(),
            json=payload,
        )
        if response.status_code != 200:
            logger.error(f"[WaveSpeed] Submission failed: {response.status_code} {response.text}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "WaveSpeed image-to-video submission failed",
                    "status_code": response.status_code,
                    "response": response.text,
                },
            )

        data = self._decode_json(response, action="WaveSpeed image-to-video submission").get("data")
        if not data or "id" not in data:
            logger.error(f"[WaveSpeed] Unexpected submission response: {response.text}")
            raise HTTPException(
                status_code=502,
                detail={"error": "WaveSpeed response missing prediction id"},
            )

        prediction_id = data["id"]
        logger.info(f"[WaveSpeed] Submitted request: {prediction_id}")
        return prediction_id

    def get_prediction_result(self, prediction_id: str, timeout: int = 120) -> Dict[str, Any]:
        """
        Fetch the current status/result for a prediction.

        Args:
            prediction_id: ID returned by a submission call.
            timeout: Request timeout in seconds (default: 120).

        Returns:
            The ``data`` object of the polling response.

        Raises:
            HTTPException: 504 on timeout; 502 on connection errors, a non-200
                response, or a response without ``data``. Transport errors
                carry ``prediction_id`` and ``resume_available`` in the detail.
        """
        url = f"{self.BASE_URL}/predictions/{prediction_id}/result"
        response = self._send(
            requests.get,
            url,
            action="WaveSpeed polling request",
            timeout=timeout,
            extra_detail={"prediction_id": prediction_id, "resume_available": True},
            headers={"Authorization": f"Bearer {self.api_key}"},
        )
        if response.status_code != 200:
            logger.error(f"[WaveSpeed] Polling failed: {response.status_code} {response.text}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "WaveSpeed prediction polling failed",
                    "status_code": response.status_code,
                    "response": response.text,
                },
            )

        result = self._decode_json(response, action="WaveSpeed polling").get("data")
        if not result:
            raise HTTPException(status_code=502, detail={"error": "WaveSpeed polling response missing data"})
        return result

    def poll_until_complete(
        self,
        prediction_id: str,
        timeout_seconds: int = 240,
        interval_seconds: float = 1.0,
    ) -> Dict[str, Any]:
        """
        Poll WaveSpeed until the job completes, fails, or times out.

        Args:
            prediction_id: ID of the prediction to wait for.
            timeout_seconds: Overall deadline in seconds (default: 240).
            interval_seconds: Sleep between polls in seconds (default: 1.0).

        Returns:
            The result object whose ``status`` is ``"completed"``.

        Raises:
            HTTPException: 502 when the job reports ``"failed"``, 504 when the
                deadline passes, or the polling error (enriched with
                ``prediction_id`` / ``resume_available``) when a poll fails.
        """
        # Monotonic clock: the deadline must not move if the wall clock is adjusted.
        started_at = time.monotonic()
        while True:
            try:
                result = self.get_prediction_result(prediction_id)
            except HTTPException as exc:
                detail = exc.detail or {}
                if isinstance(detail, dict):
                    detail.setdefault("prediction_id", prediction_id)
                    detail.setdefault("resume_available", True)
                    detail.setdefault("error", "WaveSpeed polling failed")
                raise HTTPException(status_code=exc.status_code, detail=detail) from exc

            status = result.get("status")
            if status == "completed":
                logger.info(f"[WaveSpeed] Prediction {prediction_id} completed.")
                return result

            if status == "failed":
                logger.error(f"[WaveSpeed] Prediction {prediction_id} failed: {result.get('error')}")
                raise HTTPException(
                    status_code=502,
                    detail={
                        "error": "WaveSpeed animation failed",
                        "prediction_id": prediction_id,
                        "details": result.get("error"),
                    },
                )

            elapsed = time.monotonic() - started_at
            if elapsed > timeout_seconds:
                logger.error(f"[WaveSpeed] Prediction {prediction_id} timed out after {timeout_seconds}s")
                raise HTTPException(
                    status_code=504,
                    detail={
                        "error": "WaveSpeed animation timed out",
                        "prediction_id": prediction_id,
                        "details": result,
                    },
                )

            logger.debug(f"[WaveSpeed] Prediction {prediction_id} status={status}. Waiting...")
            time.sleep(interval_seconds)

    def _resolve_prompt_output(self, outputs: Any, timeout: int) -> str:
        """
        Turn the optimizer's ``outputs`` into the optimized prompt text.

        The first output may be the text itself, a URL to fetch the text from,
        or a dict carrying it under ``text`` / ``prompt`` / ``output``.

        Raises:
            HTTPException: 502/504 when the URL cannot be fetched, 502 when no
                prompt can be extracted.
        """
        first_output = outputs[0] if isinstance(outputs, list) and outputs else None
        optimized_prompt = None

        if isinstance(first_output, str):
            if first_output.startswith(("http://", "https://")):
                logger.info(f"[WaveSpeed] Fetching optimized prompt from URL: {first_output}")
                url_response = self._send(
                    requests.get,
                    first_output,
                    action="WaveSpeed optimized prompt download",
                    timeout=timeout,
                )
                if url_response.status_code != 200:
                    logger.error(f"[WaveSpeed] Failed to fetch prompt from URL: {url_response.status_code}")
                    raise HTTPException(
                        status_code=502,
                        detail="Failed to fetch optimized prompt from WaveSpeed URL",
                    )
                optimized_prompt = url_response.text.strip()
            else:
                # It's already the text
                optimized_prompt = first_output
        elif isinstance(first_output, dict):
            optimized_prompt = first_output.get("text") or first_output.get("prompt") or first_output.get("output")

        if not optimized_prompt:
            logger.error(f"[WaveSpeed] Could not extract optimized prompt from outputs: {outputs}")
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed prompt optimizer output format not recognized",
            )
        return optimized_prompt

    def optimize_prompt(
        self,
        text: str,
        mode: str = "image",
        style: str = "default",
        image: Optional[str] = None,
        enable_sync_mode: bool = True,
        timeout: int = 30,
    ) -> str:
        """
        Optimize a prompt using WaveSpeed prompt optimizer.

        Args:
            text: The prompt text to optimize
            mode: "image" or "video" (default: "image")
            style: "default", "artistic", "photographic", "technical", "anime", "realistic" (default: "default")
            image: Base64-encoded image for context (optional)
            enable_sync_mode: If True, wait for result and return it directly (default: True)
            timeout: Request timeout in seconds (default: 30)

        Returns:
            Optimized prompt text

        Raises:
            HTTPException: 502/504 on transport errors, non-200 responses,
                missing outputs / prediction id, or unrecognized output format.
        """
        model_path = "wavespeed-ai/prompt-optimizer"
        url = f"{self.BASE_URL}/{model_path}"

        payload = {
            "text": text,
            "mode": mode,
            "style": style,
            "enable_sync_mode": enable_sync_mode,
        }
        if image:
            payload["image"] = image

        logger.info(f"[WaveSpeed] Optimizing prompt via {url} (mode={mode}, style={style})")
        response = self._send(
            requests.post,
            url,
            action="WaveSpeed prompt optimization request",
            timeout=timeout,
            headers=self._headers(),
            json=payload,
        )
        if response.status_code != 200:
            logger.error(f"[WaveSpeed] Prompt optimization failed: {response.status_code} {response.text}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "WaveSpeed prompt optimization failed",
                    "status_code": response.status_code,
                    "response": response.text,
                },
            )

        response_json = self._decode_json(response, action="WaveSpeed prompt optimization")
        # Some responses wrap the payload in "data"; others return it at top level.
        data = response_json.get("data") or response_json

        if enable_sync_mode:
            # Sync mode - result should be directly in outputs
            outputs = data.get("outputs") or []
            if not outputs:
                logger.error(f"[WaveSpeed] No outputs in sync mode response: {response.text}")
        else:
            # Async mode - take the prediction ID and poll for the result
            prediction_id = data.get("id")
            if not prediction_id:
                logger.error(f"[WaveSpeed] No prediction ID in async response: {response.text}")
                raise HTTPException(
                    status_code=502,
                    detail="WaveSpeed response missing prediction id for async mode",
                )
            result = self.poll_until_complete(prediction_id, timeout_seconds=60, interval_seconds=0.5)
            outputs = result.get("outputs") or []

        if not outputs:
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed prompt optimizer returned no outputs",
            )

        optimized_prompt = self._resolve_prompt_output(outputs, timeout)
        logger.info(f"[WaveSpeed] Prompt optimized successfully (length: {len(optimized_prompt)} chars)")
        return optimized_prompt

    def _download_audio(self, outputs: Any, timeout: int) -> bytes:
        """
        Extract the audio URL from ``outputs`` and download the audio bytes.

        The first output may be the URL itself or a dict carrying it under
        ``url`` / ``output``.

        Raises:
            HTTPException: 502 when no http(s) URL can be extracted or the
                download returns a non-200 status; 502/504 on transport errors.
        """
        first_output = outputs[0] if isinstance(outputs, list) and outputs else None
        audio_url = None
        if isinstance(first_output, str):
            audio_url = first_output
        elif isinstance(first_output, dict):
            audio_url = first_output.get("url") or first_output.get("output")

        # isinstance guard: a non-string value must be rejected, not crash on .startswith
        if not isinstance(audio_url, str) or not audio_url.startswith(("http://", "https://")):
            logger.error(f"[WaveSpeed] Invalid audio URL in outputs: {outputs}")
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed speech generator output format not recognized",
            )

        logger.info(f"[WaveSpeed] Fetching audio from URL: {audio_url}")
        audio_response = self._send(
            requests.get,
            audio_url,
            action="WaveSpeed audio download",
            timeout=timeout,
        )
        if audio_response.status_code != 200:
            logger.error(f"[WaveSpeed] Failed to fetch audio from URL: {audio_response.status_code}")
            raise HTTPException(
                status_code=502,
                detail="Failed to fetch generated audio from WaveSpeed URL",
            )

        audio_bytes = audio_response.content
        logger.info(f"[WaveSpeed] Speech generated successfully (size: {len(audio_bytes)} bytes)")
        return audio_bytes

    def generate_speech(
        self,
        text: str,
        voice_id: str,
        speed: float = 1.0,
        volume: float = 1.0,
        pitch: float = 0.0,
        emotion: str = "happy",
        enable_sync_mode: bool = True,
        timeout: int = 60,
        **kwargs
    ) -> bytes:
        """
        Generate speech audio using Minimax Speech 02 HD via WaveSpeed.

        Args:
            text: Text to convert to speech (max 10000 characters)
            voice_id: Voice ID (e.g., "Wise_Woman", "Friendly_Person", etc.)
            speed: Speech speed (0.5-2.0, default: 1.0)
            volume: Speech volume (0.1-10.0, default: 1.0)
            pitch: Speech pitch (-12 to 12, default: 0.0)
            emotion: Emotion ("happy", "sad", "angry", etc., default: "happy")
            enable_sync_mode: If True, wait for result and return it directly (default: True)
            timeout: Request timeout in seconds (default: 60)
            **kwargs: Additional parameters. Only english_normalization,
                sample_rate, bitrate, channel, format and language_boost are
                forwarded; any other keyword is ignored.

        Returns:
            bytes: Generated audio bytes

        Raises:
            HTTPException: 502/504 on transport errors, non-200 responses,
                missing outputs / prediction id, or unrecognized output format.
        """
        model_path = "minimax/speech-02-hd"
        url = f"{self.BASE_URL}/{model_path}"

        payload = {
            "text": text,
            "voice_id": voice_id,
            "speed": speed,
            "volume": volume,
            "pitch": pitch,
            "emotion": emotion,
            "enable_sync_mode": enable_sync_mode,
        }

        # Add optional parameters
        optional_params = [
            "english_normalization",
            "sample_rate",
            "bitrate",
            "channel",
            "format",
            "language_boost",
        ]
        for param in optional_params:
            if param in kwargs:
                payload[param] = kwargs[param]

        logger.info(f"[WaveSpeed] Generating speech via {url} (voice={voice_id}, text_length={len(text)})")
        response = self._send(
            requests.post,
            url,
            action="WaveSpeed speech generation request",
            timeout=timeout,
            headers=self._headers(),
            json=payload,
        )
        if response.status_code != 200:
            logger.error(f"[WaveSpeed] Speech generation failed: {response.status_code} {response.text}")
            raise HTTPException(
                status_code=502,
                detail={
                    "error": "WaveSpeed speech generation failed",
                    "status_code": response.status_code,
                    "response": response.text,
                },
            )

        response_json = self._decode_json(response, action="WaveSpeed speech generation")
        # Some responses wrap the payload in "data"; others return it at top level.
        data = response_json.get("data") or response_json

        if enable_sync_mode:
            # Sync mode - result should be directly in outputs
            outputs = data.get("outputs") or []
            if not outputs:
                logger.error(f"[WaveSpeed] No outputs in sync mode response: {response.text}")
        else:
            # Async mode - take the prediction ID and poll for the result
            prediction_id = data.get("id")
            if not prediction_id:
                logger.error(f"[WaveSpeed] No prediction ID in async response: {response.text}")
                raise HTTPException(
                    status_code=502,
                    detail="WaveSpeed response missing prediction id for async mode",
                )
            result = self.poll_until_complete(prediction_id, timeout_seconds=120, interval_seconds=0.5)
            outputs = result.get("outputs") or []

        if not outputs:
            raise HTTPException(
                status_code=502,
                detail="WaveSpeed speech generator returned no outputs",
            )

        return self._download_audio(outputs, timeout)

View File

@@ -0,0 +1,122 @@
from __future__ import annotations
import base64
from typing import Any, Dict, Optional
import requests
from fastapi import HTTPException
from loguru import logger
from .client import WaveSpeedClient
from .kling_animation import generate_animation_prompt
INFINITALK_MODEL_PATH = "wavespeed-ai/infinitetalk"
INFINITALK_MODEL_NAME = "wavespeed-ai/infinitetalk"
INFINITALK_DEFAULT_COST = 0.30 # $0.30 per 5 seconds at 720p tier
MAX_IMAGE_BYTES = 10 * 1024 * 1024 # 10MB
MAX_AUDIO_BYTES = 50 * 1024 * 1024 # 50MB safety cap
def _as_data_uri(content_bytes: bytes, mime_type: str) -> str:
encoded = base64.b64encode(content_bytes).decode("utf-8")
return f"data:{mime_type};base64,{encoded}"
def animate_scene_with_voiceover(
    *,
    image_bytes: bytes,
    audio_bytes: bytes,
    scene_data: Dict[str, Any],
    story_context: Dict[str, Any],
    user_id: str,
    resolution: str = "720p",
    prompt_override: Optional[str] = None,
    image_mime: str = "image/png",
    audio_mime: str = "audio/mpeg",
    client: Optional[WaveSpeedClient] = None,
) -> Dict[str, Any]:
    """
    Animate a scene image with narration audio using WaveSpeed InfiniteTalk.

    Parameters:
        image_bytes (bytes): Scene image (at most MAX_IMAGE_BYTES).
        audio_bytes (bytes): Narration audio (at most MAX_AUDIO_BYTES).
        scene_data (Dict[str, Any]): Scene data; used for prompt generation and logging.
        story_context (Dict[str, Any]): Story context passed to prompt generation.
        user_id (str): User ID passed to prompt generation and logged.
        resolution (str): "480p" or "720p" (default: "720p").
        prompt_override (str, optional): Prompt to use instead of generating one.
        image_mime (str): MIME type used in the image data URI.
        audio_mime (str): MIME type used in the audio data URI.
        client (WaveSpeedClient, optional): Client to use; a new one is created when omitted.

    Returns:
        Dict[str, Any]: video bytes, prompt used, duration, model name, cost,
        provider, source video URL and prediction ID.

    Raises:
        HTTPException: 404 when image/audio bytes are missing, 400 on size or
            resolution violations, 502 when WaveSpeed returns no outputs or the
            video cannot be downloaded; polling errors are re-raised with
            ``prediction_id`` / ``resume_available`` added to dict details.
    """
    if not image_bytes:
        raise HTTPException(status_code=404, detail="Scene image bytes missing for animation.")
    if not audio_bytes:
        raise HTTPException(status_code=404, detail="Scene audio bytes missing for animation.")

    if len(image_bytes) > MAX_IMAGE_BYTES:
        raise HTTPException(
            status_code=400,
            detail="Scene image exceeds 10MB limit required by WaveSpeed InfiniteTalk.",
        )
    if len(audio_bytes) > MAX_AUDIO_BYTES:
        raise HTTPException(
            status_code=400,
            detail="Scene audio exceeds 50MB limit allowed for InfiniteTalk requests.",
        )

    if resolution not in {"480p", "720p"}:
        raise HTTPException(status_code=400, detail="Resolution must be '480p' or '720p'.")

    animation_prompt = prompt_override or generate_animation_prompt(scene_data, story_context, user_id)

    payload = {
        "image": _as_data_uri(image_bytes, image_mime),
        "audio": _as_data_uri(audio_bytes, audio_mime),
        "resolution": resolution,
    }
    if animation_prompt:
        payload["prompt"] = animation_prompt

    client = client or WaveSpeedClient()
    prediction_id = client.submit_image_to_video(INFINITALK_MODEL_PATH, payload, timeout=60)

    try:
        result = client.poll_until_complete(prediction_id, timeout_seconds=600, interval_seconds=1.0)
    except HTTPException as exc:
        # Enrich dict details in place so the caller can resume this prediction.
        detail = exc.detail or {}
        if isinstance(detail, dict):
            detail.setdefault("prediction_id", prediction_id)
            detail.setdefault("resume_available", True)
        raise

    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed InfiniteTalk completed but returned no outputs.")

    video_url = outputs[0]
    try:
        video_response = requests.get(video_url, timeout=180)
    except requests.exceptions.RequestException as exc:
        # The job itself finished; keep the prediction id so the download can be retried.
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Failed to download InfiniteTalk video",
                "prediction_id": prediction_id,
                "resume_available": True,
                "exception": str(exc),
            },
        ) from exc
    if video_response.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Failed to download InfiniteTalk video",
                "status_code": video_response.status_code,
                "response": video_response.text[:200],
            },
        )

    metadata = result.get("metadata") or {}
    duration = metadata.get("duration_seconds") or metadata.get("duration") or 0

    # loguru formats positional args with str.format braces; "%s" would be logged literally.
    logger.info(
        "[InfiniteTalk] Generated talking avatar video user={} scene={} resolution={} size={} bytes",
        user_id,
        scene_data.get("scene_number"),
        resolution,
        len(video_response.content),
    )

    return {
        "video_bytes": video_response.content,
        "prompt": animation_prompt,
        # Default to 5 seconds when the result metadata carries no duration.
        "duration": duration or 5,
        "model_name": INFINITALK_MODEL_NAME,
        "cost": INFINITALK_DEFAULT_COST,
        "provider": "wavespeed",
        "source_video_url": video_url,
        "prediction_id": prediction_id,
    }

View File

@@ -0,0 +1,360 @@
from __future__ import annotations
import base64
import json
from typing import Any, Dict, Optional
import requests
from fastapi import HTTPException
from services.llm_providers.main_text_generation import llm_text_gen
from utils.logger_utils import get_service_logger
from .client import WaveSpeedClient
try:
import imghdr
except ModuleNotFoundError: # Python 3.13 removed imghdr
imghdr = None
logger = get_service_logger("wavespeed.kling_animation")
KLING_MODEL_PATH = "kwaivgi/kling-v2.5-turbo-std/image-to-video"
KLING_MODEL_5S = "kling-v2.5-turbo-std-5s"
KLING_MODEL_10S = "kling-v2.5-turbo-std-10s"
MAX_IMAGE_BYTES = 10 * 1024 * 1024 # 10 MB limit per docs
def _detect_image_mime(image_bytes: bytes) -> str:
if imghdr:
detected = imghdr.what(None, h=image_bytes)
if detected == "jpeg":
return "image/jpeg"
if detected == "png":
return "image/png"
if detected == "gif":
return "image/gif"
header = image_bytes[:8]
if header.startswith(b"\x89PNG"):
return "image/png"
if header[:2] == b"\xff\xd8":
return "image/jpeg"
if header[:3] in (b"GIF", b"GIF"):
return "image/gif"
return "image/png"
def _build_fallback_prompt(scene_data: Dict[str, Any], story_context: Dict[str, Any]) -> str:
title = (scene_data.get("title") or "Scene").strip()
description = (scene_data.get("description") or "").strip()
image_prompt = (scene_data.get("image_prompt") or "").strip()
tone = (story_context.get("story_tone") or "story").strip()
setting = (story_context.get("story_setting") or "the scene").strip()
parts = [
f"{title} cinematic motion shot.",
description[:220] if description else "",
f"Camera glides with subtle parallax over {setting}.",
f"Maintain a {tone} mood with natural lighting accents.",
f"Honor the original illustration details: {image_prompt[:200]}." if image_prompt else "",
"5-second sequence, gentle push-in, flowing cloth and atmospheric particles.",
]
fallback_prompt = " ".join(filter(None, parts))
return fallback_prompt.strip()
def _load_llm_json_response(response_text: Any) -> Dict[str, Any]:
"""Normalize responses from llm_text_gen (dict or JSON string)."""
if isinstance(response_text, dict):
return response_text
if isinstance(response_text, str):
return json.loads(response_text)
raise ValueError(f"Unexpected response type: {type(response_text)}")
def _generate_text_prompt(
    *,
    prompt: str,
    system_prompt: str,
    user_id: str,
    fallback_prompt: str,
) -> str:
    """Fallback text generation when structured JSON parsing fails.

    Calls ``llm_text_gen`` without a JSON schema and extracts a usable
    prompt from whatever comes back.

    Args:
        prompt: Instruction text sent to the LLM (stripped before sending).
        system_prompt: System prompt forwarded unchanged.
        user_id: Identifier forwarded to ``llm_text_gen``.
        fallback_prompt: Deterministic prompt returned when the LLM call
            fails or yields nothing usable.

    Returns:
        The cleaned prompt text, or ``fallback_prompt``.

    Raises:
        HTTPException: Re-raised only for status 429; every other failure
            is logged and answered with ``fallback_prompt``.
    """
    try:
        response = llm_text_gen(
            prompt=prompt.strip(),
            system_prompt=system_prompt,
            user_id=user_id,
        )
    except HTTPException as exc:
        if exc.status_code == 429:
            raise
        logger.warning(
            "[AnimateScene] Text-mode prompt generation failed (%s). Using deterministic fallback.",
            exc.detail,
        )
        return fallback_prompt
    except Exception as exc:
        logger.error(
            "[AnimateScene] Unexpected error generating text prompt: %s",
            exc,
            exc_info=True,
        )
        return fallback_prompt

    # Without this guard, str(None) would be returned as the prompt "None".
    if response is None:
        return fallback_prompt

    payload = response
    if isinstance(response, str):
        # The instruction text asks the model to "Respond with JSON", so a
        # text-mode reply is often a JSON object (possibly wrapped in a code
        # fence or preamble). Try to decode the outermost {...} span so the
        # raw JSON is not passed on as the animation prompt.
        start, end = response.find("{"), response.rfind("}")
        if 0 <= start < end:
            try:
                decoded = json.loads(response[start:end + 1])
            except json.JSONDecodeError:
                decoded = None
            if isinstance(decoded, dict):
                payload = decoded

    if isinstance(payload, dict):
        for key in ("animation_prompt", "prompt", "text"):
            candidate = payload.get(key)
            if isinstance(candidate, str) and candidate.strip():
                return candidate.strip()

    if isinstance(response, dict):
        # As a last resort, stringify the dict
        response_text = json.dumps(response, ensure_ascii=False)
    else:
        response_text = str(response)
    cleaned = response_text.strip()
    return cleaned or fallback_prompt
def generate_animation_prompt(
    scene_data: Dict[str, Any],
    story_context: Dict[str, Any],
    user_id: str,
) -> str:
    """
    Generate an animation-focused prompt using llm_text_gen, falling back to a deterministic prompt if LLM fails.

    Strategy, in order:
      1. Structured (JSON-schema) LLM call.
      2. Text-mode LLM call when the structured call fails or its output
         cannot be parsed.
      3. Deterministic prompt from ``_build_fallback_prompt`` on any other
         unexpected error.

    Args:
        scene_data: Scene fields; ``title``, ``description`` and
            ``image_prompt`` are read.
        story_context: Story fields; ``story_tone`` and ``story_setting``
            are read.
        user_id: Identifier forwarded to ``llm_text_gen``.

    Returns:
        A non-empty animation prompt string.

    Raises:
        HTTPException: Re-raised only when the LLM call fails with status 429.
    """
    fallback_prompt = _build_fallback_prompt(scene_data, story_context)
    system_prompt = (
        "You are an expert cinematic animation director. "
        "You transform static illustrated scenes into short cinematic motion clips. "
        "Describe motion, camera behavior, atmosphere, and pacing."
    )
    # Coalesce missing/None values to "" so the literal text "None" never
    # leaks into the instruction sent to the LLM.
    description = scene_data.get("description") or ""
    image_prompt = scene_data.get("image_prompt") or ""
    title = scene_data.get("title") or ""
    tone = story_context.get("story_tone") or ""
    setting = story_context.get("story_setting") or ""
    prompt = f"""
Create a concise animation prompt (2-3 sentences) for a 5-second cinematic clip.
Scene Title: {title}
Description: {description}
Existing Image Prompt: {image_prompt}
Story Tone: {tone}
Setting: {setting}
Focus on:
- Motion of characters/objects
- Camera movement (pan, zoom, dolly, orbit)
- Atmosphere, lighting, and emotion
- Timing cues appropriate for a {tone or "story"} scene
Respond with JSON: {{"animation_prompt": "<prompt>"}}
"""

    def _retry_in_text_mode() -> str:
        # Shared second attempt used by both recoverable failure paths.
        return _generate_text_prompt(
            prompt=prompt,
            system_prompt=system_prompt,
            user_id=user_id,
            fallback_prompt=fallback_prompt,
        )

    try:
        response = llm_text_gen(
            prompt=prompt.strip(),
            system_prompt=system_prompt,
            user_id=user_id,
            json_struct={
                "type": "object",
                "properties": {
                    "animation_prompt": {
                        "type": "string",
                        "description": "A cinematic motion prompt for the WaveSpeed image-to-video model.",
                    }
                },
                "required": ["animation_prompt"],
            },
        )
        structured = _load_llm_json_response(response)
        # A JSON string may decode to a list/scalar; treat that as a parse
        # failure so the text-mode retry still runs.
        if not isinstance(structured, dict):
            raise ValueError("Structured response is not a JSON object")
        animation_prompt = structured.get("animation_prompt")
        if not animation_prompt or not isinstance(animation_prompt, str):
            raise ValueError("Missing animation_prompt in structured response")
        cleaned_prompt = animation_prompt.strip()
        if not cleaned_prompt:
            raise ValueError("animation_prompt is empty after trimming")
        return cleaned_prompt
    except HTTPException as exc:
        if exc.status_code == 429:
            raise
        logger.warning(
            "[AnimateScene] Structured LLM prompt generation failed (%s). Falling back to text parsing.",
            exc.detail,
        )
        return _retry_in_text_mode()
    except (json.JSONDecodeError, ValueError, KeyError) as exc:
        logger.warning(
            "[AnimateScene] Failed to parse structured animation prompt (%s). Falling back to text parsing.",
            exc,
        )
        return _retry_in_text_mode()
    except Exception as exc:
        logger.error(
            "[AnimateScene] Unexpected error generating animation prompt: %s",
            exc,
            exc_info=True,
        )
        return fallback_prompt
def animate_scene_image(
    *,
    image_bytes: bytes,
    scene_data: Dict[str, Any],
    story_context: Dict[str, Any],
    user_id: str,
    duration: int = 5,
    guidance_scale: float = 0.5,
    negative_prompt: Optional[str] = None,
    client: Optional[WaveSpeedClient] = None,
) -> Dict[str, Any]:
    """
    Animate a scene image using WaveSpeed Kling v2.5 Turbo Std.
    Returns dict with video bytes, prompt used, model name, duration, and cost.

    Args:
        image_bytes: Raw bytes of the still image; must be non-empty and at
            most ``MAX_IMAGE_BYTES``.
        scene_data: Scene fields used to build the animation prompt.
        story_context: Story fields used to build the animation prompt.
        user_id: Identifier forwarded to prompt generation.
        duration: Clip length in seconds; only 5 or 10 are accepted.
        guidance_scale: Clamped into the range [0.0, 1.0] before sending.
        negative_prompt: Optional negative prompt; ignored when blank.
        client: Optional pre-built client; a new ``WaveSpeedClient`` is
            created when omitted.

    Raises:
        HTTPException: 400 for invalid duration or image size; the polling
            error re-raised with ``prediction_id`` attached so the caller
            can resume; 502 when no outputs are returned or the video
            download fails.
    """
    if duration not in (5, 10):
        raise HTTPException(status_code=400, detail="Duration must be 5 or 10 seconds for scene animation.")
    if not image_bytes:
        raise HTTPException(status_code=400, detail="Scene image is empty.")
    if len(image_bytes) > MAX_IMAGE_BYTES:
        raise HTTPException(
            status_code=400,
            detail="Scene image exceeds 10MB limit required by WaveSpeed."
        )
    guidance_scale = max(0.0, min(1.0, guidance_scale))
    animation_prompt = generate_animation_prompt(scene_data, story_context, user_id)
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")
    payload = {
        "duration": duration,
        "guidance_scale": guidance_scale,
        "image": image_b64,
        "prompt": animation_prompt,
    }
    # Skip whitespace-only negative prompts instead of sending "".
    cleaned_negative = negative_prompt.strip() if negative_prompt else ""
    if cleaned_negative:
        payload["negative_prompt"] = cleaned_negative

    client = client or WaveSpeedClient()
    prediction_id = client.submit_image_to_video(KLING_MODEL_PATH, payload)
    try:
        result = client.poll_until_complete(prediction_id, timeout_seconds=240, interval_seconds=1.0)
    except HTTPException as exc:
        detail = exc.detail or {}
        if isinstance(detail, dict):
            # Work on a copy so the caught exception's detail is not mutated.
            detail = dict(detail)
            detail.setdefault("prediction_id", prediction_id)
            detail.setdefault("resume_available", True)
            detail.setdefault("message", "WaveSpeed request is still processing. Use resume endpoint to fetch the video once ready.")
        raise HTTPException(status_code=exc.status_code, detail=detail) from exc

    outputs = result.get("outputs") or []
    if not outputs:
        raise HTTPException(status_code=502, detail="WaveSpeed completed but returned no outputs.")
    video_url = outputs[0]
    try:
        video_response = requests.get(video_url, timeout=60)
    except requests.RequestException as exc:
        # The prediction already completed; surface its id so the caller can
        # fetch the video via the resume path instead of paying again.
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Failed to download animation video",
                "message": str(exc),
                "prediction_id": prediction_id,
                "resume_available": True,
            },
        ) from exc
    if video_response.status_code != 200:
        raise HTTPException(
            status_code=502,
            detail={
                "error": "Failed to download animation video",
                "status_code": video_response.status_code,
                "response": video_response.text[:200],
                "prediction_id": prediction_id,
            },
        )
    model_name = KLING_MODEL_5S if duration == 5 else KLING_MODEL_10S
    cost = 0.21 if duration == 5 else 0.42
    return {
        "video_bytes": video_response.content,
        "prompt": animation_prompt,
        "duration": duration,
        "model_name": model_name,
        "cost": cost,
        "provider": "wavespeed",
        "source_video_url": video_url,
        "prediction_id": prediction_id,
    }
def resume_scene_animation(
    *,
    prediction_id: str,
    duration: int,
    user_id: str,
    client: Optional[WaveSpeedClient] = None,
) -> Dict[str, Any]:
    """
    Fetch the finished video for an animation that was submitted earlier.

    Looks up the prediction, requires its status to be ``"completed"``,
    downloads the first output URL and returns the same result shape as a
    fresh animation. ``user_id`` is accepted but not used in this function.

    Raises:
        HTTPException: 400 for an unsupported duration, 409 while the
            prediction is not completed, 502 when there are no outputs or
            the download does not return HTTP 200.
    """
    if duration != 5 and duration != 10:
        raise HTTPException(status_code=400, detail="Duration must be 5 or 10 seconds for scene animation.")

    wavespeed = client if client else WaveSpeedClient()
    prediction = wavespeed.get_prediction_result(prediction_id, timeout=120)

    current_status = prediction.get("status")
    if current_status != "completed":
        not_ready_detail = {
            "error": "WaveSpeed prediction is not completed yet",
            "prediction_id": prediction_id,
            "status": current_status,
        }
        raise HTTPException(status_code=409, detail=not_ready_detail)

    output_urls = prediction.get("outputs") or []
    if not output_urls:
        raise HTTPException(status_code=502, detail="WaveSpeed completed but returned no outputs.")

    video_url = output_urls[0]
    download = requests.get(video_url, timeout=120)
    if download.status_code != 200:
        download_failure = {
            "error": "Failed to download animation video during resume",
            "status_code": download.status_code,
            "response": download.text[:200],
            "prediction_id": prediction_id,
        }
        raise HTTPException(status_code=502, detail=download_failure)

    # Model label and price are derived from the requested clip length.
    if duration == 5:
        model_name, cost = KLING_MODEL_5S, 0.21
    else:
        model_name, cost = KLING_MODEL_10S, 0.42

    logger.info("[AnimateScene] Resumed download for prediction=%s", prediction_id)
    return {
        "video_bytes": download.content,
        "prompt": prediction.get("prompt") or "",
        "duration": duration,
        "model_name": model_name,
        "cost": cost,
        "provider": "wavespeed",
        "source_video_url": video_url,
        "prediction_id": prediction_id,
    }

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 MiB

View File

@@ -0,0 +1,658 @@
# LinkedIn Writer: Multimedia Content Revamp
## Executive Summary
This document outlines the comprehensive revamp of ALwrity's LinkedIn Writer to transform it from a text-only content tool into a complete multimedia content creation platform. By integrating video generation, avatar creation, image generation, and voice cloning, LinkedIn Writer will enable users to create engaging, professional multimedia content that drives higher engagement on LinkedIn.
---
## Current State Analysis
### Existing LinkedIn Writer Features
**Current Capabilities**:
- Text content generation (posts, articles)
- Writing style optimization for LinkedIn
- Fact checking and credibility features
- Engagement optimization
- Brand voice consistency
- Industry-specific content
**Current Limitations**:
- Text-only content (no video)
- Basic image generation (limited integration)
- No audio/video narration
- No avatar/personal branding videos
- Limited multimedia options
- No video post creation
**Location**:
- Backend: `backend/api/linkedin_writer/`
- Frontend: `frontend/src/components/LinkedInWriter/`
---
## Proposed Enhancements
### 1. Video Content Creation
#### 1.1 LinkedIn Video Posts
**Feature**: Generate professional video posts for LinkedIn
**Use Cases**:
- Thought leadership videos
- Product announcements
- Company updates
- Industry insights
- Personal brand building
- Educational content
**Implementation**:
**Backend**: `backend/api/linkedin_writer/video_generation.py` (NEW)
```python
@router.post("/generate-video-post")
async def generate_linkedin_video_post(
request: LinkedInVideoPostRequest,
current_user: Dict[str, Any] = Depends(get_current_user),
) -> LinkedInVideoPostResponse:
"""
Generate LinkedIn video post with synchronized audio.
Uses WAN 2.5 for professional video generation.
"""
# 1. Generate video script from text content
# 2. Generate audio narration (persona voice if available)
# 3. Generate video with WAN 2.5
# 4. Optimize for LinkedIn (aspect ratio, duration)
# 5. Return video URL and metadata
pass
```
**Video Specifications for LinkedIn**:
- **Aspect Ratio**: 16:9 (landscape) or 9:16 (vertical)
- **Duration**: 15 seconds to 10 minutes
- **Resolution**: 720p minimum, 1080p recommended
- **Format**: MP4
- **Audio**: Synchronized narration, background music optional
**UI Component**: `frontend/src/components/LinkedInWriter/VideoPostCreator.tsx` (NEW)
**Features**:
- Text-to-video conversion
- Script editor with timing
- Video preview
- Resolution selection
- Duration control
- Cost estimation
---
#### 1.2 Avatar-Based Video Posts
**Feature**: Create video posts with user's avatar (from persona system)
**Use Cases**:
- Personal branding videos
- Consistent presence across posts
- Professional video messages
- Thought leadership content
**Implementation**:
**Integration with Persona System**:
```python
def generate_avatar_video_post(
    user_id: str,
    text_content: str,
    use_persona_avatar: bool = True,
    duration: int = 60,  # seconds; selects the avatar model below
) -> bytes:
    """
    Generate LinkedIn video post with user's avatar.
    Uses Hunyuan Avatar or InfiniteTalk based on duration.
    """
    # 1. Get user's persona
    persona = get_persona(user_id)
    # 2. Generate audio with persona voice
    audio = generate_audio_with_persona_voice(text_content, persona)
    # 3. Generate video with persona avatar
    if duration <= 120:  # 2 minutes
        video = generate_with_hunyuan_avatar(persona.avatar_id, audio)
    else:  # Longer content
        video = generate_with_infinitetalk(persona.avatar_id, audio)
    return video
```
**UI Component**: `frontend/src/components/LinkedInWriter/AvatarVideoCreator.tsx` (NEW)
---
### 2. Enhanced Image Generation
#### 2.1 LinkedIn-Optimized Images
**Feature**: Generate professional images for LinkedIn posts
**Current State**: Basic image generation exists but limited
**Enhancements**:
- LinkedIn-specific image sizes
- Professional style optimization
- Brand consistency
- Multiple image options for A/B testing
**Implementation**:
**Backend**: `backend/api/linkedin_writer/image_generation.py` (ENHANCED)
```python
@router.post("/generate-post-image")
async def generate_linkedin_post_image(
request: LinkedInImageRequest,
current_user: Dict[str, Any] = Depends(get_current_user),
) -> LinkedInImageResponse:
"""
Generate LinkedIn-optimized image for post.
Uses Ideogram V3 Turbo for photorealistic images.
"""
# 1. Analyze post content for image context
# 2. Generate image prompt
# 3. Generate image with Ideogram
# 4. Optimize for LinkedIn (size, format)
# 5. Return image URL
pass
```
**Image Specifications**:
- **Sizes**:
- Post image: 1200x627px (1.91:1)
- Article cover: 1200x627px
- Carousel: 1080x1080px (1:1)
- **Format**: JPG or PNG
- **Style**: Professional, clean, brand-consistent
**UI Component**: `frontend/src/components/LinkedInWriter/ImageGenerator.tsx` (ENHANCED)
---
#### 2.2 Image-to-Video Conversion
**Feature**: Animate static images into video posts
**Use Cases**:
- Product showcases
- Before/after animations
- Infographic animations
- Portfolio presentations
**Implementation**:
**Backend Integration**:
```python
@router.post("/animate-image")
async def animate_linkedin_image(
request: LinkedInImageAnimationRequest,
current_user: Dict[str, Any] = Depends(get_current_user),
) -> LinkedInVideoResponse:
"""
Convert LinkedIn post image to animated video.
Uses WAN 2.5 image-to-video.
"""
# 1. Get uploaded image
# 2. Generate animation prompt
# 3. Use WAN 2.5 image-to-video
# 4. Add audio narration if provided
# 5. Return video
pass
```
---
### 3. Audio Content Integration
#### 3.1 Audio Narration for Posts
**Feature**: Add professional audio narration to LinkedIn posts
**Use Cases**:
- Audio versions of posts (accessibility)
- Podcast-style content
- Voice-over for videos
- Multilingual content
**Implementation**:
**Backend**: `backend/api/linkedin_writer/audio_generation.py` (NEW)
```python
@router.post("/generate-audio-narration")
async def generate_linkedin_audio(
request: LinkedInAudioRequest,
current_user: Dict[str, Any] = Depends(get_current_user),
) -> LinkedInAudioResponse:
"""
Generate audio narration for LinkedIn post.
Uses persona voice if available.
"""
# 1. Get user's persona
# 2. Generate audio with persona voice
# 3. Optimize for LinkedIn (duration, format)
# 4. Return audio URL
pass
```
**Audio Specifications**:
- **Format**: MP3
- **Duration**: Up to 10 minutes
- **Quality**: 128kbps minimum
- **Voice**: Persona voice (if trained) or professional TTS
---
### 4. Complete Multimedia Post Creation
#### 4.1 Unified Multimedia Post Creator
**Feature**: Create LinkedIn posts with text, image, video, and audio
**UI Component**: `frontend/src/components/LinkedInWriter/MultimediaPostCreator.tsx` (NEW)
**Workflow**:
```
1. User writes post content
2. System suggests multimedia options:
├─ Generate image
├─ Create video
├─ Add audio narration
└─ Animate image
3. User selects options
4. System generates multimedia content
5. User previews and edits
6. User publishes to LinkedIn
```
**Features**:
- Text editor with formatting
- Image generator with preview
- Video creator with script editor
- Audio narrator with voice selection
- Cost estimation for each option
- Preview before generation
- Batch generation for multiple posts
---
## Implementation Phases
### Phase 1: Video Post Creation (Week 1-3)
**Priority**: HIGH - Most engaging content type
**Tasks**:
1. ✅ Create video generation endpoint
2. ✅ Integrate WAN 2.5 for LinkedIn videos
3. ✅ Add video post creator UI
4. ✅ Implement script editor
5. ✅ Add video preview
6. ✅ Optimize for LinkedIn specs
7. ✅ Add cost estimation
8. ✅ Integrate with persona voice
9. ✅ Testing and optimization
**Files to Create**:
- `backend/api/linkedin_writer/video_generation.py`
- `frontend/src/components/LinkedInWriter/VideoPostCreator.tsx`
- `frontend/src/components/LinkedInWriter/VideoPreview.tsx`
**Files to Modify**:
- `backend/api/linkedin_writer/router.py`
- `frontend/src/components/LinkedInWriter/LinkedInWriter.tsx`
- `frontend/src/services/linkedinWriterApi.ts`
**Success Criteria**:
- Users can create video posts
- Videos optimized for LinkedIn
- Cost tracking accurate
- Good video quality
- Persona voice integration works
---
### Phase 2: Enhanced Image Generation (Week 4-5)
**Priority**: MEDIUM - Improves existing feature
**Tasks**:
1. ✅ Enhance image generation endpoint
2. ✅ Integrate Ideogram V3 Turbo
3. ✅ Add LinkedIn-specific image sizes
4. ✅ Improve image generation UI
5. ✅ Add image-to-video conversion
6. ✅ Add multiple image options
7. ✅ Brand consistency features
8. ✅ Testing and optimization
**Files to Create**:
- `frontend/src/components/LinkedInWriter/ImageGenerator.tsx` (enhanced)
- `frontend/src/components/LinkedInWriter/ImageToVideoConverter.tsx`
**Files to Modify**:
- `backend/api/linkedin_writer/image_generation.py`
- `frontend/src/components/LinkedInWriter/LinkedInWriter.tsx`
**Success Criteria**:
- High-quality LinkedIn images
- Multiple image options
- Image-to-video works
- Cost-effective
---
### Phase 3: Avatar Video Integration (Week 6-7)
**Priority**: HIGH - Personal branding differentiator
**Tasks**:
1. ✅ Integrate Hunyuan Avatar
2. ✅ Integrate InfiniteTalk
3. ✅ Create avatar video creator UI
4. ✅ Add persona avatar integration
5. ✅ Add video duration controls
6. ✅ Add preview and editing
7. ✅ Testing and optimization
**Files to Create**:
- `backend/api/linkedin_writer/avatar_video.py`
- `frontend/src/components/LinkedInWriter/AvatarVideoCreator.tsx`
**Files to Modify**:
- `backend/api/linkedin_writer/router.py`
- `frontend/src/components/LinkedInWriter/LinkedInWriter.tsx`
**Success Criteria**:
- Avatar videos work well
- Persona integration seamless
- Good video quality
- Cost tracking accurate
---
### Phase 4: Audio & Multimedia Integration (Week 8-9)
**Priority**: MEDIUM - Complete multimedia suite
**Tasks**:
1. ✅ Create audio generation endpoint
2. ✅ Integrate persona voice
3. ✅ Create unified multimedia creator
4. ✅ Add batch generation
5. ✅ Add cost optimization
6. ✅ Add analytics
7. ✅ Testing and polish
**Files to Create**:
- `backend/api/linkedin_writer/audio_generation.py`
- `frontend/src/components/LinkedInWriter/MultimediaPostCreator.tsx`
- `frontend/src/components/LinkedInWriter/AudioNarrator.tsx`
**Success Criteria**:
- Complete multimedia workflow
- All features integrated
- Good user experience
- Cost-effective
---
## Cost Management
### Video Generation Costs
**WAN 2.5 Text-to-Video**:
- 480p: $0.05/second
- 720p: $0.10/second
- 1080p: $0.15/second
**LinkedIn Video Optimization**:
- Default: 720p (good quality, cost-effective)
- Premium: 1080p (best quality)
- Typical post: 30-60 seconds = $3-9
**Avatar Videos**:
- Hunyuan Avatar: $0.15-0.30 per 5 seconds
- InfiniteTalk: $0.15-0.30 per 5 seconds (up to 10 minutes)
- Typical post: 60 seconds = $1.80-3.60
### Image Generation Costs
**Ideogram V3 Turbo**: ~$0.04-0.08 per image
**Multiple Options**: 3-5 images = $0.12-0.40
### Audio Generation Costs
**Persona Voice**: $0.02 per minute
**Typical Post**: 2-3 minutes = $0.04-0.06
### Cost Optimization Strategies
1. **Pre-Flight Validation**: Check costs before generation
2. **Resolution Selection**: Default to cost-effective options
3. **Batch Discounts**: Lower cost for multiple posts
4. **Usage Limits**: Per-tier limits to prevent waste
5. **Cost Estimates**: Show costs before generation
---
## LinkedIn Platform Optimization
### Video Best Practices
**LinkedIn Video Specifications**:
- **Maximum Duration**: 10 minutes
- **Recommended Duration**: 15-90 seconds for posts
- **Aspect Ratios**:
- 16:9 (landscape) - best for desktop
- 9:16 (vertical) - best for mobile
- 1:1 (square) - works for both
- **Resolution**: 720p minimum, 1080p recommended
- **File Size**: Up to 5GB
- **Format**: MP4 (H.264 codec)
**Optimization Features**:
- Auto-optimize for LinkedIn
- Aspect ratio selection
- Duration recommendations
- Thumbnail generation
- Caption/subtitle support
### Image Best Practices
**LinkedIn Image Specifications**:
- **Post Image**: 1200x627px (1.91:1)
- **Article Cover**: 1200x627px
- **Carousel**: 1080x1080px (1:1)
- **Profile Banner**: 1584x396px
- **Format**: JPG or PNG
- **File Size**: Up to 5MB
**Optimization Features**:
- Auto-resize for LinkedIn
- Format optimization
- Compression for web
- Multiple size options
---
## User Experience Flow
### Enhanced LinkedIn Writer Workflow
```
1. User opens LinkedIn Writer
2. User selects content type:
├─ Text Post
├─ Video Post
├─ Image Post
├─ Carousel Post
└─ Article
3. User writes content (or AI generates)
4. System suggests multimedia options:
├─ Generate professional image
├─ Create video with narration
├─ Add audio version
└─ Create avatar video
5. User selects multimedia options
6. System shows cost estimate
7. User approves and generates
8. User previews content
9. User edits if needed
10. User publishes to LinkedIn
```
### Multimedia Post Creator UI
**Layout**:
```
┌─────────────────────────────────────┐
│ LinkedIn Multimedia Post Creator │
├─────────────────────────────────────┤
│ │
│ [Text Editor] │
│ ┌─────────────────────────────┐ │
│ │ Write your post content... │ │
│ │ │ │
│ └─────────────────────────────┘ │
│ │
│ [Multimedia Options] │
│ ┌──────┐ ┌──────┐ ┌──────┐ │
│ │ Image│ │Video │ │Audio │ │
│ │ $0.1│ │ $3.00│ │ $0.05│ │
│ └──────┘ └──────┘ └──────┘ │
│ │
│ [Preview] │
│ ┌─────────────────────────────┐ │
│ │ [Generated Content Preview] │ │
│ └─────────────────────────────┘ │
│ │
│ [Cost Summary] │
│ Total: $3.15 │
│ │
│ [Generate] [Preview] [Publish] │
└─────────────────────────────────────┘
```
---
## Integration Points
### Persona System Integration
**Voice Integration**:
- Use persona voice for video narration
- Use persona voice for audio posts
- Consistent brand voice across content
**Avatar Integration**:
- Use persona avatar for video posts
- Consistent visual presence
- Professional branding
### Story Writer Integration
**Shared Services**:
- Video generation (WAN 2.5)
- Voice cloning (Minimax)
- Avatar generation (Hunyuan/InfiniteTalk)
- Image generation (Ideogram)
**Code Reuse**:
- Share video generation service
- Share audio generation service
- Share image generation service
- Unified cost tracking
---
## Success Metrics
### Engagement Metrics
- Video post engagement vs. text posts (target: 3x higher)
- Image post engagement vs. text posts (target: 2x higher)
- Multimedia post reach vs. text posts (target: 2.5x higher)
### Adoption Metrics
- Video post creation rate (target: >30% of users)
- Image generation usage (target: >60% of users)
- Avatar video usage (target: >20% of Pro users)
### Quality Metrics
- Video quality satisfaction (target: >4.5/5)
- Image quality satisfaction (target: >4.5/5)
- User satisfaction with multimedia features (target: >4.5/5)
### Business Metrics
- Premium tier conversion (multimedia as differentiator)
- User retention (multimedia users vs. text-only)
- Content generation volume (multimedia users create more)
---
## Risk Mitigation
| Risk | Mitigation |
|------|------------|
| High costs | Pre-flight validation, tier-based limits, cost estimates |
| Quality issues | Quality checks, preview before generation, regeneration option |
| LinkedIn API changes | Monitor LinkedIn updates, adapt quickly |
| User confusion | Clear UI, tooltips, tutorials, documentation |
| Performance issues | Optimize generation, queue system, background processing |
---
## Competitive Advantage
### Unique Features
1. **Complete Multimedia Suite**: Text + Image + Video + Audio in one tool
2. **Persona Integration**: Consistent brand voice and avatar
3. **LinkedIn Optimization**: Platform-specific optimizations
4. **Cost-Effective**: More affordable than competitors
5. **AI-Powered**: Automated content generation
### Market Position
- **vs. Canva**: More AI-powered, integrated with content generation
- **vs. Loom**: More features, LinkedIn-optimized, persona integration
- **vs. Descript**: More affordable, LinkedIn-focused, persona integration
---
## Next Steps
1. **Week 1**: Set up WaveSpeed API access for LinkedIn videos
2. **Week 1-2**: Implement video post generation
3. **Week 2-3**: Create video post creator UI
4. **Week 4-5**: Enhance image generation
5. **Week 6-7**: Integrate avatar videos
6. **Week 8**: Add audio narration
7. **Week 8-9**: Create unified multimedia creator
8. **Week 9**: Testing, optimization, and polish
---
*Document Version: 1.0*
*Last Updated: November 2025*
*Priority: HIGH - LinkedIn Engagement Driver*

View File

@@ -0,0 +1,615 @@
# Persona System: Voice Cloning & Avatar Hyper-Personalization
## Executive Summary
This document outlines the integration of voice cloning and AI avatar capabilities into ALwrity's Persona System to enable true hyper-personalization. Users will train their voice and create their avatar during onboarding, then use these across all content generation (LinkedIn, Blog, Story Writer, etc.) for consistent brand identity.
---
## Vision: AI Hyper-Personalization
**Goal**: Every piece of content generated by ALwrity should feel authentically "you" - not just in writing style, but in voice and visual presence.
**Current State**: Persona system handles writing style only
**Target State**: Persona system handles writing style + voice + avatar = complete brand identity
---
## Current Persona System Analysis
### Existing Capabilities
- **Writing Style Analysis**: Tone, voice, complexity, engagement level
- **Platform Adaptation**: LinkedIn, Facebook, Blog optimizations
- **Content Characteristics**: Sentence structure, vocabulary, patterns
- **Onboarding Integration**: Automatically generated from onboarding data
### Current Limitations
- No voice/personality in audio content
- No visual representation
- Limited to text-based personalization
- Cannot create video content with user's presence
### Persona System Architecture
**Location**: `backend/services/persona_analysis_service.py`
**Current Flow**:
1. User completes onboarding (6 steps)
2. System analyzes website content and writing style
3. Core persona generated
4. Platform-specific adaptations created
5. Persona saved to database
**Database Model**: `backend/models/persona_models.py` - `WritingPersona` table
---
## Proposed Enhancements
### 1. Voice Cloning Integration
#### 1.1 Voice Training During Onboarding
**Integration Point**: Onboarding Step 6 (Persona Generation)
**New Onboarding Flow**:
```
Step 1-5: Existing onboarding steps
Step 6: Persona Generation
├─ Writing Style Analysis (existing)
├─ Voice Training (NEW)
│ ├─ Audio sample upload (1-3 minutes)
│ ├─ Voice clone training (~2-5 minutes)
│ └─ Voice preview and approval
└─ Avatar Creation (NEW)
├─ Photo upload
├─ Avatar generation
└─ Avatar preview and approval
```
**Implementation**:
**Backend**: `backend/services/persona/voice_persona_service.py` (NEW)
```python
class VoicePersonaService:
"""
Manages voice cloning for persona system.
Integrates with Minimax voice clone API.
"""
def train_voice_from_audio(
self,
user_id: str,
audio_file_path: str,
persona_id: int,
) -> Dict[str, Any]:
"""
Train voice clone from user's audio sample.
Links voice to persona.
"""
# 1. Validate audio file (format, length, quality)
# 2. Upload to Minimax
# 3. Train voice clone
# 4. Store voice_id in persona
# 5. Return training status
pass
def generate_audio_with_persona_voice(
self,
text: str,
persona_id: int,
emotion: str = "neutral",
speed: float = 1.0,
) -> bytes:
"""
Generate audio using persona's cloned voice.
"""
# 1. Get voice_id from persona
# 2. Call Minimax voice generation
# 3. Return audio bytes
pass
```
**Database Schema Update**: `backend/models/persona_models.py`
```python
class WritingPersona(Base):
# Existing fields...
# NEW: Voice cloning fields
voice_id: Optional[str] = Column(String(255), nullable=True)
voice_training_status: Optional[str] = Column(String(50), nullable=True) # 'not_trained', 'training', 'ready', 'failed'
voice_training_audio_url: Optional[str] = Column(String(500), nullable=True)
voice_trained_at: Optional[datetime] = Column(DateTime, nullable=True)
# NEW: Avatar fields
avatar_id: Optional[str] = Column(String(255), nullable=True)
avatar_image_url: Optional[str] = Column(String(500), nullable=True)
avatar_training_status: Optional[str] = Column(String(50), nullable=True)
avatar_created_at: Optional[datetime] = Column(DateTime, nullable=True)
```
**Frontend**: `frontend/src/components/Onboarding/PersonaGenerationStep.tsx` (NEW)
```typescript
interface PersonaGenerationStepProps {
onboardingData: OnboardingData;
onComplete: (persona: Persona) => void;
}
const PersonaGenerationStep: React.FC<PersonaGenerationStepProps> = ({
onboardingData,
onComplete,
}) => {
// 1. Show writing style analysis progress
// 2. Show voice training section
// 3. Show avatar creation section
// 4. Preview complete persona
// 5. Allow approval/modification
};
```
#### 1.2 Voice Usage Across Platform
**Integration Points**:
- **Story Writer**: Use persona voice for audio narration
- **LinkedIn**: Voice-over for video posts
- **Blog**: Audio narration for blog posts
- **Email**: Personalized voice messages
- **Social Media**: Video content with user's voice
**Implementation Pattern**:
```python
# In any content generation service
def generate_content_with_persona(user_id: str, content_type: str):
    # 1. Get user's persona
    persona = get_persona(user_id)
    # 2. Generate text content (existing)
    text_content = generate_text(persona)
    # 3. Generate audio with persona voice (NEW)
    audio_content = None  # stays None when the persona has no trained voice
    if persona.voice_id and persona.voice_training_status == 'ready':
        audio_content = voice_service.generate_audio_with_persona_voice(
            text=text_content,
            persona_id=persona.id,
        )
    # 4. Generate video with persona avatar (NEW)
    video_content = None  # stays None without an avatar or without audio
    if persona.avatar_id and audio_content is not None:
        video_content = avatar_service.generate_video_with_persona_avatar(
            text=text_content,
            audio_bytes=audio_content,
            persona_id=persona.id,
        )
    return {
        'text': text_content,
        'audio': audio_content,
        'video': video_content,
    }
```
---
### 2. Avatar Creation Integration
#### 2.1 Avatar Training During Onboarding
**Integration Point**: Onboarding Step 6 (Persona Generation)
**Avatar Options**:
1. **Hunyuan Avatar**: Talking avatar from photo + audio
2. **InfiniteTalk**: Long-form avatar videos
3. **Custom Avatar**: User's photo as avatar base
**Implementation**:
**Backend**: `backend/services/persona/avatar_persona_service.py` (NEW)
```python
class AvatarPersonaService:
"""
Manages avatar creation for persona system.
Integrates with WaveSpeed Hunyuan Avatar and InfiniteTalk.
"""
def create_avatar_from_photo(
self,
user_id: str,
photo_file_path: str,
persona_id: int,
) -> Dict[str, Any]:
"""
Create avatar from user's photo.
Uses Hunyuan Avatar for initial creation.
"""
# 1. Validate photo (format, size, quality)
# 2. Upload to WaveSpeed
# 3. Create avatar
# 4. Store avatar_id in persona
# 5. Return avatar preview
pass
def generate_video_with_persona_avatar(
self,
text: str,
audio_bytes: bytes,
persona_id: int,
duration: int = 60, # seconds
) -> bytes:
"""
Generate video with persona's avatar speaking.
Uses InfiniteTalk for long-form, Hunyuan for short.
"""
# 1. Get avatar_id from persona
# 2. Get voice_id from persona (for audio)
# 3. Call WaveSpeed API
# 4. Return video bytes
pass
```
#### 2.2 Avatar Usage Across Platform
**Use Cases**:
- **LinkedIn Video Posts**: User's avatar presenting content
- **Story Writer**: Avatar narrating story scenes
- **Blog Videos**: Avatar explaining blog content
- **Email Campaigns**: Personalized video messages
- **Social Media**: Consistent avatar across platforms
---
### 3. Enhanced Persona Management
#### 3.1 Persona Dashboard
**New UI Component**: `frontend/src/components/Persona/PersonaDashboard.tsx`
**Features**:
- Persona overview (writing style, voice, avatar)
- Voice training status and preview
- Avatar preview and management
- Usage statistics (where persona is used)
- Edit/update options
#### 3.2 Persona Settings
**New UI Component**: `frontend/src/components/Persona/PersonaSettings.tsx`
**Settings**:
- Voice parameters (emotion, speed, tone)
- Avatar appearance (clothing, background, style)
- Platform-specific adaptations
- Content type preferences
---
## Implementation Phases
### Phase 1: Voice Cloning Integration (Week 1-3)
**Priority**: HIGH - Core hyper-personalization feature
**Tasks**:
1. ✅ Create `VoicePersonaService`
2. ✅ Integrate Minimax voice clone API
3. ✅ Add voice fields to `WritingPersona` model
4. ✅ Update onboarding Step 6 with voice training
5. ✅ Create voice training UI component
6. ✅ Add voice preview and testing
7. ✅ Integrate voice into Story Writer
8. ✅ Add voice usage tracking
9. ✅ Update persona dashboard
10. ✅ Testing and optimization
**Files to Create**:
- `backend/services/persona/voice_persona_service.py`
- `frontend/src/components/Onboarding/VoiceTrainingSection.tsx`
- `frontend/src/components/Persona/VoiceManagement.tsx`
**Files to Modify**:
- `backend/models/persona_models.py`
- `backend/services/persona_analysis_service.py`
- `backend/api/onboarding_utils/` (onboarding routes)
- `frontend/src/components/Onboarding/PersonaGenerationStep.tsx`
- `backend/services/story_writer/audio_generation_service.py`
**Success Criteria**:
- Users can train voice during onboarding
- Voice used automatically in Story Writer
- Voice quality significantly better than gTTS
- Voice linked to persona
- Cost tracking accurate
---
### Phase 2: Avatar Creation Integration (Week 4-6)
**Priority**: HIGH - Visual personalization
**Tasks**:
1. ✅ Create `AvatarPersonaService`
2. ✅ Integrate Hunyuan Avatar API
3. ✅ Add avatar fields to `WritingPersona` model
4. ✅ Update onboarding Step 6 with avatar creation
5. ✅ Create avatar creation UI component
6. ✅ Add avatar preview and testing
7. ✅ Integrate avatar into content generation
8. ✅ Add avatar usage tracking
9. ✅ Update persona dashboard
10. ✅ Testing and optimization
**Files to Create**:
- `backend/services/persona/avatar_persona_service.py`
- `frontend/src/components/Onboarding/AvatarCreationSection.tsx`
- `frontend/src/components/Persona/AvatarManagement.tsx`
**Files to Modify**:
- `backend/models/persona_models.py`
- `backend/services/persona_analysis_service.py`
- `frontend/src/components/Onboarding/PersonaGenerationStep.tsx`
- `backend/services/story_writer/video_generation_service.py`
**Success Criteria**:
- Users can create avatar during onboarding
- Avatar used in video content generation
- Avatar quality good
- Avatar linked to persona
- Cost tracking accurate
---
### Phase 3: Cross-Platform Integration (Week 7-8)
**Priority**: MEDIUM - Complete hyper-personalization
**Tasks**:
1. ✅ Integrate persona voice into LinkedIn Writer
2. ✅ Integrate persona avatar into LinkedIn Writer
3. ✅ Integrate persona voice into Blog Writer
4. ✅ Integrate persona avatar into Blog Writer
5. ✅ Add persona usage analytics
6. ✅ Update all content generation services
7. ✅ Create persona usage dashboard
8. ✅ Documentation and user guides
**Success Criteria**:
- Persona voice/avatar used across all platforms
- Consistent brand identity
- Good user experience
- Analytics working
---
## Cost Management
### Voice Cloning Costs
**One-Time Training**: $0.75 per voice
**Per-Minute Generation**: $0.02 per minute
**Cost Optimization**:
- Train voice once during onboarding (included in Pro/Enterprise)
- Free tier: gTTS only
- Basic tier: Voice training available ($0.75 one-time)
- Pro/Enterprise: Voice training included
### Avatar Creation Costs
**Hunyuan Avatar**: $0.15-0.30 per 5 seconds
**InfiniteTalk**: $0.15-0.30 per 5 seconds (up to 10 minutes)
**Cost Optimization**:
- Avatar creation: One-time during onboarding
- Video generation: Pay-per-use
- Default to shorter videos (5 seconds)
- Allow longer videos for premium users
### Subscription Integration
**Update Subscription Tiers**:
- **Free**: Writing persona only, no voice/avatar
- **Basic**: Writing persona + voice training ($0.75 one-time)
- **Pro**: Writing persona + voice + avatar creation included
- **Enterprise**: All features + unlimited usage
---
## User Experience Flow
### Onboarding Flow (Enhanced)
```
Step 1-5: Existing onboarding steps
Step 6: Persona Generation
├─ Writing Style Analysis
│ └─ [Progress: Analyzing your writing style...]
├─ Voice Training (NEW)
│ ├─ Upload audio sample (1-3 minutes)
│ ├─ [Training your voice...] (~2-5 minutes)
│ ├─ Preview generated voice
│ └─ Approve or retrain
└─ Avatar Creation (NEW)
├─ Upload photo
├─ [Creating your avatar...] (~1-2 minutes)
├─ Preview avatar
└─ Approve or recreate
Step 7: Persona Preview
├─ Writing Style Summary
├─ Voice Preview
├─ Avatar Preview
└─ Approve Complete Persona
```
### Content Generation Flow (Enhanced)
```
User creates content (LinkedIn/Blog/Story)
System loads user's persona
├─ Writing style → Text generation
├─ Voice ID → Audio generation (if available)
└─ Avatar ID → Video generation (if available)
Content generated with full personalization
├─ Text matches writing style
├─ Audio uses user's voice
└─ Video shows user's avatar
```
---
## Technical Architecture
### Backend Services
```
backend/services/
├── persona/
│ ├── __init__.py
│ ├── voice_persona_service.py # NEW: Voice cloning
│ ├── avatar_persona_service.py # NEW: Avatar creation
│ └── persona_analysis_service.py # Enhanced
├── minimax/
│ └── voice_clone.py # Shared with Story Writer
└── wavespeed/
└── avatar_generation.py # Shared with Story Writer
```
### Frontend Components
```
frontend/src/components/
├── Onboarding/
│ ├── PersonaGenerationStep.tsx # Enhanced
│ ├── VoiceTrainingSection.tsx # NEW
│ └── AvatarCreationSection.tsx # NEW
└── Persona/
├── PersonaDashboard.tsx # NEW
├── VoiceManagement.tsx # NEW
├── AvatarManagement.tsx # NEW
└── PersonaSettings.tsx # NEW
```
### Database Schema
```sql
-- Enhanced WritingPersona table
ALTER TABLE writing_persona ADD COLUMN voice_id VARCHAR(255);
ALTER TABLE writing_persona ADD COLUMN voice_training_status VARCHAR(50);
ALTER TABLE writing_persona ADD COLUMN voice_training_audio_url VARCHAR(500);
ALTER TABLE writing_persona ADD COLUMN voice_trained_at TIMESTAMP;
ALTER TABLE writing_persona ADD COLUMN avatar_id VARCHAR(255);
ALTER TABLE writing_persona ADD COLUMN avatar_image_url VARCHAR(500);
ALTER TABLE writing_persona ADD COLUMN avatar_training_status VARCHAR(50);
ALTER TABLE writing_persona ADD COLUMN avatar_created_at TIMESTAMP;
```
---
## Integration with Existing Systems
### Story Writer Integration
**Location**: `backend/services/story_writer/audio_generation_service.py`
**Enhancement**:
```python
def generate_scene_audio(
self,
scene: Dict[str, Any],
user_id: str,
use_persona_voice: bool = True, # NEW: Use persona voice
) -> Dict[str, Any]:
if use_persona_voice:
# Get user's persona
persona = get_persona(user_id)
if persona.voice_id and persona.voice_training_status == 'ready':
# Use persona voice
return self._generate_with_persona_voice(scene, persona)
# Fallback to default provider
return self._generate_with_gtts(scene)
```
### LinkedIn Writer Integration
**Enhancement**: Add video generation with persona avatar
- LinkedIn video posts with user's avatar
- Voice-over with user's voice
- Consistent brand presence
### Blog Writer Integration
**Enhancement**: Add audio/video options
- Audio narration with persona voice
- Video explanations with persona avatar
- Enhanced blog content
---
## Success Metrics
### Adoption Metrics
- Voice training completion rate (target: >60% of Pro users)
- Avatar creation completion rate (target: >50% of Pro users)
- Persona usage across platforms (target: >80% of content uses persona)
### Quality Metrics
- Voice quality satisfaction (target: >4.5/5)
- Avatar quality satisfaction (target: >4.5/5)
- Brand consistency score (target: >90%)
### Business Metrics
- User retention (persona users vs. non-persona)
- Content engagement (persona content vs. generic)
- Premium tier conversion (persona as differentiator)
---
## Risk Mitigation
| Risk | Mitigation |
|------|------------|
| Voice training failure | Quality checks, clear error messages, retry option |
| Avatar quality issues | Preview before approval, regeneration option |
| Cost concerns | Clear pricing, tier-based access, cost estimates |
| User privacy | Secure storage, opt-in consent, data encryption |
| API reliability | Fallback options, retry logic, error handling |
---
## Privacy & Security
### Data Storage
- Voice samples: Encrypted storage, deleted after training
- Avatar photos: Encrypted storage, user can delete
- Voice/Avatar IDs: Secure API keys, no raw data stored
### User Control
- Users can delete voice/avatar anytime
- Users can retrain voice/avatar
- Users can opt-out of voice/avatar features
- Clear privacy policy
---
## Next Steps
1. **Week 1**: Set up Minimax API access
2. **Week 1-2**: Implement voice persona service
3. **Week 2-3**: Integrate into onboarding
4. **Week 3-4**: Integrate into Story Writer
5. **Week 4-5**: Set up WaveSpeed avatar API
6. **Week 5-6**: Implement avatar persona service
7. **Week 6-7**: Integrate into onboarding
8. **Week 7-8**: Cross-platform integration
---
*Document Version: 1.0*
*Last Updated: January 2025*
*Priority: HIGH - Core Hyper-Personalization Feature*

View File

@@ -0,0 +1,834 @@
# Story Writer Video Generation Enhancement Plan
## Executive Summary
This document outlines the immediate enhancement plan for ALwrity's Story Writer: make WaveSpeed AI models the primary video provider in place of the unreliable HuggingFace video generation (which is retained as a fallback), and add professional voice cloning alongside basic gTTS audio (which remains the free default). This provides immediate value to users while solving current technical issues.
---
## Current State Analysis
### Current Video Generation
- **Provider**: HuggingFace (tencent/HunyuanVideo via fal-ai)
- **Issues**:
- Unreliable API responses
- Limited quality control
- No audio synchronization
- Single provider dependency
- Poor error handling
### Current Audio Generation
- **Provider**: gTTS (Google Text-to-Speech)
- **Limitations**:
- Robotic, non-natural voice
- No brand voice consistency
- Limited language options
- No emotion control
- Cannot clone user's voice
### Current Story Writer Workflow
1. User creates story outline with scenes
2. Each scene has `audio_narration` text
3. Audio generated via gTTS per scene
4. Video generated via HuggingFace per scene
5. Videos compiled into final story video
**Location**: `backend/api/story_writer/` and `frontend/src/components/StoryWriter/`
---
## Proposed Enhancements
### Core Principles
**Provider Abstraction**:
- Users should NOT see provider names (HuggingFace, WaveSpeed, etc.)
- All provider routing/switching happens automatically in the background
- Users only see user-friendly options like "Standard Quality" or "Premium Quality"
- System automatically selects best available provider based on user's subscription and credits
**Preserve Existing Options**:
- gTTS remains available as free fallback when credits run out
- HuggingFace remains available as fallback option
- All existing functionality preserved
- New features are additions, not replacements
**Cost Transparency**:
- All buttons show cost information in tooltips
- Users make informed decisions before generating
- No surprise costs
---
### 1. Provider-Agnostic Video Generation System
#### 1.1 Smart Provider Routing
**Backend Implementation** (`backend/services/llm_providers/main_video_generation.py`):
```python
def ai_video_generate(
prompt: str,
quality: str = "standard", # "standard" (480p), "high" (720p), "premium" (1080p)
duration: int = 5,
audio_file_path: Optional[str] = None,
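*,  # parameters after this marker are keyword-only, so a required one may follow the defaults above
user_id: str,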
**kwargs,
) -> bytes:
"""
Unified video generation entry point.
Automatically routes to best available provider:
- WaveSpeed WAN 2.5 (primary, if credits available)
- HuggingFace (fallback, if WaveSpeed unavailable)
Users never see provider names - only quality options.
"""
# 1. Check user subscription and credits
# 2. Select best available provider automatically
# 3. Route to appropriate provider function
# 4. Handle fallbacks transparently
pass
def _select_video_provider(
user_id: str,
quality: str,
pricing_service: PricingService,
) -> Tuple[str, str]:
"""
Automatically select best video provider.
Returns: (provider_name, model_name)
Selection logic:
1. Check user credits/subscription
2. Prefer WaveSpeed if available and credits sufficient
3. Fallback to HuggingFace if WaveSpeed unavailable
4. Return error if no providers available
"""
# Implementation details...
```
**Key Features**:
- Automatic provider selection (users don't choose)
- Seamless fallback between providers
- Quality-based options (Standard/High/Premium) instead of provider names
- Cost-aware routing (uses cheapest available option)
- Transparent error handling
**Quality Mapping**:
- **Standard Quality** (480p): $0.05/second - Uses WaveSpeed 480p or HuggingFace
- **High Quality** (720p): $0.10/second - Uses WaveSpeed 720p
- **Premium Quality** (1080p): $0.15/second - Uses WaveSpeed 1080p
**Cost Optimization**:
- Default to Standard Quality (480p) for cost-effectiveness
- Allow upgrade to High/Premium for final export
- Pre-flight validation prevents waste
- Automatic fallback to free options when credits exhausted
---
### 2. Enhanced Audio Generation with Voice Cloning
#### 2.1 User-Friendly Voice Selection
**Key Principle**: Users choose between "AI Clone Voice" or "Default Voice" (gTTS) - no provider names shown.
**Backend Implementation** (`backend/services/story_writer/audio_generation_service.py`):
```python
class StoryAudioGenerationService:
def generate_scene_audio(
self,
scene: Dict[str, Any],
user_id: str,
use_ai_voice: bool = False, # User's choice: AI Clone or Default
**kwargs,
) -> Dict[str, Any]:
"""
Generate audio with automatic provider selection.
If use_ai_voice=True:
- Try persona voice clone (if trained)
- Try Minimax voice clone (if credits available)
- Fallback to gTTS if no credits
If use_ai_voice=False:
- Use gTTS (always free, always available)
"""
if use_ai_voice:
# Try AI voice options
if self._has_persona_voice(user_id):
return self._generate_with_persona_voice(scene, user_id)
elif self._has_credits_for_voice_clone(user_id):
return self._generate_with_minimax_voice_clone(scene, user_id)
else:
# Fallback to gTTS with notification
logger.info(f"Credits exhausted, falling back to gTTS for user {user_id}")
return self._generate_with_gtts(scene, **kwargs)
else:
# User explicitly chose default voice
return self._generate_with_gtts(scene, **kwargs)
```
**Voice Options in Story Setup**:
- **Default Voice (gTTS)**: Free, always available, robotic but functional
- **AI Clone Voice**: Natural, human-like, requires credits ($0.02/minute)
**Cost Considerations**:
- Voice training: One-time cost (~$0.75) - only if user wants to train custom voice
- Voice generation: ~$0.02 per minute (only when AI Clone Voice selected)
- gTTS: Always free, always available as fallback
- Automatic fallback to gTTS when credits exhausted (with user notification)
---
### 3. Enhanced Story Setup UI
#### 3.1 Video Generation Settings (Provider-Agnostic)
**Location**: `frontend/src/components/StoryWriter/Phases/StorySetup/GenerationSettingsSection.tsx`
**User-Friendly Settings** (No Provider Names):
```typescript
interface VideoGenerationSettings {
// Quality selection (NOT provider selection)
videoQuality: 'standard' | 'high' | 'premium'; // Maps to 480p/720p/1080p
// Duration
videoDuration: 5 | 10; // seconds
// Cost estimation (shown in tooltip)
estimatedCostPerScene: number;
totalEstimatedCost: number;
// Provider routing happens automatically in backend
// Users never see "WaveSpeed" or "HuggingFace"
}
```
**UI Components**:
- Quality selector: "Standard" / "High" / "Premium" (with cost in tooltip)
- Duration selector: 5s (default) / 10s (premium)
- Cost tooltip: Shows estimated cost per scene and total
- Pre-flight validation warnings
- **No provider selector** - routing is automatic
**Tooltip Example**:
```
Standard Quality (480p)
├─ Cost: $0.25 per scene (5 seconds)
├─ Quality: Good for previews and testing
└─ Provider: Automatically selected based on credits
```
#### 3.2 Audio Generation Settings (Simple Choice)
**New Settings**:
```typescript
interface AudioGenerationSettings {
// Simple user choice - no provider names
voiceType: 'default' | 'ai_clone'; // "Default Voice" or "AI Clone Voice"
// Only shown if ai_clone selected
voiceTrainingStatus: 'not_trained' | 'training' | 'ready' | 'failed';
// Existing gTTS settings (preserved)
audioLang: string;
audioSlow: boolean;
audioRate: number;
}
```
**UI Components**:
- **Voice Type Selector**:
- "Default Voice (gTTS)" - Free, always available
- "AI Clone Voice" - Natural, $0.02/minute (with cost tooltip)
- Voice training section (only if AI Clone Voice selected)
- Existing gTTS settings (preserved for Default Voice)
- Cost per minute display in tooltip
**Tooltip for "AI Clone Voice"**:
```
AI Clone Voice
├─ Cost: $0.02 per minute
├─ Quality: Natural, human-like narration
├─ Fallback: Automatically uses Default Voice if credits exhausted
└─ Training: One-time $0.75 to train your custom voice (optional)
```
**Tooltip for "Default Voice"**:
```
Default Voice (gTTS)
├─ Cost: Free
├─ Quality: Standard text-to-speech
└─ Always Available: Works even when credits exhausted
```
---
### 4. New "Animate Scene" Feature in Outline Phase
#### 4.1 Per-Scene Animation Preview
**Location**: `frontend/src/components/StoryWriter/Phases/StoryOutline.tsx`
**Feature**: Add "Animate Scene" hover option alongside existing scene actions
**Implementation**:
- Add to `OutlineHoverActions` component
- Appears on hover over scene cards
- Only generates for single scene (never bulk)
- Uses cheapest option (480p/Standard Quality) to give users a feel
- Shows cost in tooltip before generation
**UI Component**:
```typescript
// In OutlineHoverActions.tsx
const sceneHoverActions = [
// Existing actions...
{
icon: <PlayArrowIcon />,
label: 'Animate Scene',
action: 'animate-scene',
tooltip: `Animate this scene with video\nCost: ~$0.25 (5 seconds, Standard Quality)\nPreview only - uses cheapest option`,
onClick: handleAnimateScene,
},
];
```
**Backend Endpoint**:
```python
@router.post("/animate-scene-preview")
async def animate_scene_preview(
request: SceneAnimationRequest,
current_user: Dict[str, Any] = Depends(get_current_user),
) -> SceneAnimationResponse:
"""
Generate preview animation for a single scene.
Always uses cheapest option (480p/Standard Quality).
Per-scene only - never bulk generation.
"""
# 1. Validate single scene only
# 2. Use Standard Quality (480p) - cheapest option
# 3. Generate video with automatic provider routing
# 4. Return preview video URL
pass
```
**Cost Management**:
- Always uses Standard Quality (480p) - $0.25 per scene
- Pre-flight validation before generation
- Clear cost display in tooltip
- Per-scene only prevents bulk waste
---
### 5. New "Animate Story with VoiceOver" Button in Writing Phase
#### 5.1 Complete Story Animation
**Location**: `frontend/src/components/StoryWriter/Phases/StoryWriting.tsx`
**Feature**: New button alongside existing HuggingFace video options
**Implementation**:
- Add button in Writing phase toolbar
- Generates complete animated story with synchronized voiceover
- Uses user's voice preference from Setup (AI Clone or Default)
- Shows comprehensive cost breakdown in tooltip
- Pre-flight validation before generation
**UI Component**:
```typescript
<Button
variant="contained"
startIcon={<SmartDisplayIcon />}
onClick={handleAnimateStoryWithVoiceOver}
disabled={!state.storyContent || isGenerating}
title={`Animate Story with VoiceOver\n\nCost Breakdown:\n- Video: $${videoCost} (${scenes.length} scenes × $${costPerScene})\n- Audio: $${audioCost} (${totalAudioMinutes} minutes)\n- Total: $${totalCost}\n\nQuality: ${state.videoQuality}\nVoice: ${state.voiceType === 'ai_clone' ? 'AI Clone' : 'Default'}`}
>
Animate Story with VoiceOver
</Button>
```
**Backend Endpoint**:
```python
@router.post("/animate-story-with-voiceover")
async def animate_story_with_voiceover(
request: StoryAnimationRequest,
current_user: Dict[str, Any] = Depends(get_current_user),
) -> StoryAnimationResponse:
"""
Generate complete animated story with synchronized voiceover.
Uses user's quality and voice preferences from Setup.
"""
# 1. Pre-flight validation (cost, credits, limits)
# 2. Generate audio for all scenes (using user's voice preference)
# 3. Generate videos for all scenes (using user's quality preference)
# 4. Synchronize audio with video
# 5. Compile into final story video
# 6. Return video URL and cost breakdown
pass
```
**Cost Tooltip Example**:
```
Animate Story with VoiceOver
Cost Breakdown:
├─ Video (Standard Quality): $2.50
│ └─ 10 scenes × $0.25 per scene
├─ Audio (AI Clone Voice): $1.00
│ └─ 50 minutes total × $0.02/minute
└─ Total: $3.50
Settings:
├─ Quality: Standard (480p)
├─ Voice: AI Clone Voice
└─ Duration: 5 seconds per scene
⚠️ This will use $3.50 of your monthly credits
```
---
## Implementation Phases
### Phase 1: Provider-Agnostic Video System (Week 1-2)
**Priority**: HIGH - Solves immediate HuggingFace issues with provider abstraction
**Tasks**:
1. ✅ Create WaveSpeed API client (`backend/services/wavespeed/client.py`)
2. ✅ Add WAN 2.5 text-to-video function
3. ✅ Implement smart provider routing in `main_video_generation.py`
4. ✅ Add quality-based selection (Standard/High/Premium)
5. ✅ Preserve HuggingFace as fallback option
6. ✅ Update `hd_video.py` with provider routing
7. ✅ Add pre-flight cost validation
8. ✅ Update frontend with quality selector (remove provider names)
9. ✅ Add cost tooltips to all buttons
10. ✅ Update subscription limits
11. ✅ Testing and error handling
**Files to Modify**:
- `backend/services/llm_providers/main_video_generation.py` (add routing logic)
- `backend/api/story_writer/utils/hd_video.py` (use quality-based API)
- `backend/api/story_writer/routes/video_generation.py`
- `frontend/src/components/StoryWriter/Phases/StorySetup/GenerationSettingsSection.tsx` (quality selector)
- `frontend/src/components/StoryWriter/components/HdVideoSection.tsx`
- `backend/services/subscription/pricing_service.py`
**Success Criteria**:
- Video generation works reliably with automatic provider routing
- Users see quality options, not provider names
- HuggingFace preserved as fallback
- Cost tracking accurate
- Pre-flight validation prevents waste
- Error messages clear and actionable
---
### Phase 2: Voice Cloning Integration (Week 3-4)
**Priority**: MEDIUM - Enhances audio quality with simple user choice
**Tasks**:
1. ✅ Create Minimax API client (`backend/services/minimax/voice_clone.py`)
2. ✅ Add voice training endpoint
3. ✅ Add voice generation endpoint
4. ✅ Update `audio_generation_service.py` with "AI Clone" vs "Default" logic
5. ✅ Preserve gTTS as always-available fallback
6. ✅ Add automatic fallback when credits exhausted
7. ✅ Update Story Setup with simple voice type selector
8. ✅ Add cost tooltips to voice options
9. ✅ Add voice preview and testing (if AI Clone selected)
10. ✅ Ensure gTTS always works even when credits exhausted
**Files to Create**:
- `backend/services/minimax/voice_clone.py`
- `backend/services/story_writer/voice_management_service.py`
**Files to Modify**:
- `backend/services/story_writer/audio_generation_service.py` (add voice type logic)
- `frontend/src/components/StoryWriter/Phases/StorySetup/GenerationSettingsSection.tsx` (voice type selector)
- `backend/models/story_models.py` (add voice type field)
**Success Criteria**:
- Users see simple choice: "Default Voice" or "AI Clone Voice"
- gTTS always available as fallback
- Automatic fallback when credits exhausted
- Cost tracking accurate
- Voice quality significantly better than gTTS when AI Clone used
---
### Phase 3: New Features - Animate Scene & Animate Story (Week 5-6)
**Priority**: MEDIUM - Add preview and complete animation features
**Tasks**:
1. ✅ Add "Animate Scene" hover option in Outline phase
2. ✅ Implement per-scene animation preview (cheapest option only)
3. ✅ Add "Animate Story with VoiceOver" button in Writing phase
4. ✅ Implement complete story animation with voiceover
5. ✅ Add comprehensive cost tooltips to all buttons
6. ✅ Add pre-flight validation for all animation features
7. ✅ Ensure per-scene only (no bulk generation in Outline)
8. ✅ Update documentation
9. ✅ User testing and feedback
**Files to Create**:
- `backend/api/story_writer/routes/scene_animation.py` (new endpoint)
- `frontend/src/components/StoryWriter/components/AnimateSceneButton.tsx`
**Files to Modify**:
- `frontend/src/components/StoryWriter/Phases/StoryOutlineParts/OutlineHoverActions.tsx` (add Animate Scene)
- `frontend/src/components/StoryWriter/Phases/StoryWriting.tsx` (add Animate Story button)
- `backend/api/story_writer/routes/video_generation.py` (add story animation endpoint)
**Success Criteria**:
- "Animate Scene" works in Outline (per-scene, cheapest option)
- "Animate Story with VoiceOver" works in Writing phase
- All buttons show cost in tooltips
- Pre-flight validation prevents waste
- Good user experience
---
### Phase 4: Integration & Optimization (Week 7-8)
**Priority**: MEDIUM - Polish and optimize
**Tasks**:
1. ✅ Integrate audio with video (synchronized videos)
2. ✅ Improve error handling and retry logic
3. ✅ Add progress indicators
4. ✅ Optimize cost calculations
5. ✅ Add usage analytics
6. ✅ Update documentation
7. ✅ User testing and feedback
**Success Criteria**:
- Smooth end-to-end workflow
- Cost-effective for users
- Reliable generation
- Excellent user experience
- All features work seamlessly together
---
## Cost Management & Prevention of Waste
### Pre-Flight Validation
**Implementation**: `backend/services/subscription/preflight_validator.py`
**Checks Before Generation**:
1. User has sufficient subscription tier
2. Estimated cost within monthly budget
3. Video generation limit not exceeded
4. Audio generation limit not exceeded
5. Total story cost reasonable (<$5 for typical story)
**Validation Flow**:
```python
def validate_story_generation(
pricing_service: PricingService,
user_id: str,
num_scenes: int,
video_resolution: str,
video_duration: int,
use_voice_clone: bool,
) -> Tuple[bool, str, Dict[str, Any]]:
"""
Pre-flight validation before story generation.
Returns: (allowed, message, cost_breakdown)
"""
# Calculate estimated costs
video_cost_per_scene = get_wavespeed_cost(video_resolution, video_duration)
audio_cost_per_scene = get_voice_clone_cost() if use_voice_clone else 0.0
total_estimated_cost = (video_cost_per_scene + audio_cost_per_scene) * num_scenes
# Check limits
limits = pricing_service.get_user_limits(user_id)
current_usage = pricing_service.get_current_usage(user_id)
# Validation logic...
return (allowed, message, cost_breakdown)
```
### Cost Estimation Display
**Frontend Implementation**:
- Real-time cost calculator in Story Setup
- Per-scene cost breakdown
- Total story cost estimate
- Monthly budget remaining
- Warning if approaching limits
**UI Example**:
```
Video Generation Cost Estimate:
├─ Resolution: 720p ($0.10/second)
├─ Duration: 5 seconds per scene
├─ Scenes: 10
└─ Total: $5.00
Audio Generation Cost Estimate:
├─ Provider: Voice Clone ($0.02/minute)
├─ Average: 30 seconds per scene
├─ Scenes: 10
└─ Total: $0.10
Total Estimated Cost: $5.10
Monthly Budget Remaining: $44.00
```
### Usage Tracking
**Enhanced Tracking**:
- Track video generation per scene
- Track audio generation per scene
- Track total story cost
- Alert users approaching limits
- Provide cost breakdown in analytics
---
## Pricing Integration
### WaveSpeed WAN 2.5 Pricing
**Add to `pricing_service.py`**:
```python
# WaveSpeed WAN 2.5 Text-to-Video
{
"provider": APIProvider.VIDEO, # Or new WAVESPEED provider
"model_name": "wan-2.5-480p",
"cost_per_second": 0.05,
"description": "WaveSpeed WAN 2.5 Text-to-Video (480p)"
},
{
"provider": APIProvider.VIDEO,
"model_name": "wan-2.5-720p",
"cost_per_second": 0.10,
"description": "WaveSpeed WAN 2.5 Text-to-Video (720p)"
},
{
"provider": APIProvider.VIDEO,
"model_name": "wan-2.5-1080p",
"cost_per_second": 0.15,
"description": "WaveSpeed WAN 2.5 Text-to-Video (1080p)"
}
```
### Minimax Voice Clone Pricing
**Add to `pricing_service.py`**:
```python
# Minimax Voice Clone
{
"provider": APIProvider.AUDIO, # New provider type
"model_name": "minimax-voice-clone-train",
"cost_per_request": 0.75, # One-time training cost
"description": "Minimax Voice Clone Training"
},
{
"provider": APIProvider.AUDIO,
"model_name": "minimax-voice-clone-generate",
"cost_per_minute": 0.02, # Per minute of generated audio
"description": "Minimax Voice Clone Generation"
}
```
### Subscription Tier Limits
**Update subscription limits**:
- **Free**: 3 stories/month, 480p only, gTTS only
- **Basic**: 10 stories/month, up to 720p, voice clone available
- **Pro**: 50 stories/month, up to 1080p, voice clone included
- **Enterprise**: Unlimited, all features
---
## Technical Architecture
### Backend Services
```
backend/services/
├── wavespeed/
│ ├── __init__.py
│ ├── client.py # WaveSpeed API client
│ ├── wan25_video.py # WAN 2.5 video generation
│ └── models.py # Request/response models
├── minimax/
│ ├── __init__.py
│ ├── client.py # Minimax API client
│ ├── voice_clone.py # Voice cloning service
│ └── models.py
└── story_writer/
├── audio_generation_service.py # Updated with voice clone
└── video_generation_service.py # Updated with WaveSpeed
```
### Frontend Components
```
frontend/src/components/StoryWriter/
├── Phases/StorySetup/
│ └── GenerationSettingsSection.tsx # Enhanced with new settings
├── components/
│ ├── HdVideoSection.tsx # Updated for WaveSpeed
│ ├── VoiceTrainingSection.tsx # NEW: Voice training UI
│ └── CostEstimationDisplay.tsx # NEW: Cost calculator
└── hooks/
└── useStoryGenerationCost.ts # NEW: Cost calculation hook
```
---
## Error Handling & User Experience
### Error Scenarios
1. **WaveSpeed API Failure**:
- Retry with exponential backoff (3 attempts)
- Fallback to HuggingFace if available
- Clear error message with cost refund notice
2. **Voice Clone Training Failure**:
- Provide specific error (audio quality, length, format)
- Suggest improvements
- Allow retry with different audio
3. **Cost Limit Exceeded**:
- Pre-flight validation prevents this
- Show upgrade prompt
- Suggest reducing scenes/resolution
4. **Audio/Video Mismatch**:
- Validate audio length matches video duration
- Auto-trim or extend audio
- Warn user before generation
### User Feedback
- Progress indicators for all operations
- Clear cost breakdowns
- Quality previews before final generation
- Regeneration options with cost tracking
- Usage analytics dashboard
---
## Testing Plan
### Unit Tests
- WaveSpeed API client
- Voice clone service
- Cost calculation
- Pre-flight validation
### Integration Tests
- End-to-end story generation
- Audio + video synchronization
- Error handling and fallbacks
- Subscription limit enforcement
### User Acceptance Tests
- Story generation workflow
- Voice training process
- Cost estimation accuracy
- Error recovery
---
## Success Metrics
### Technical Metrics
- Video generation success rate >95%
- Audio generation success rate >98%
- Average generation time per scene <30s
- API error rate <2%
### Business Metrics
- User satisfaction with video quality
- Cost per story (target: <$5 for 10-scene story)
- Voice clone adoption rate
- Story completion rate
### User Experience Metrics
- Time to generate story
- Error recovery time
- User understanding of costs
- Feature discovery rate
---
## Provider Management Strategy
### Always-Available Options
- **gTTS**: Always available, always free, works even when credits exhausted
- **HuggingFace**: Preserved as fallback option, works when WaveSpeed unavailable
### Automatic Provider Routing
- **Primary**: WaveSpeed WAN 2.5 (when credits available)
- **Fallback**: HuggingFace (when WaveSpeed unavailable or credits exhausted)
- **Audio Fallback**: gTTS (always available, always free)
### User Experience
- Users never see provider names
- System automatically selects best available option
- Seamless fallback when credits exhausted
- Clear notifications when fallback occurs
- No user intervention required
### No Deprecation
- **HuggingFace**: Kept as permanent fallback option
- **gTTS**: Kept as permanent free option
- All existing functionality preserved
- New features are additions, not replacements
---
## Next Steps
1. **Week 1**: Set up WaveSpeed API access and credentials
2. **Week 1**: Implement provider-agnostic routing system
3. **Week 2**: Integrate into Story Writer with quality-based UI
4. **Week 3**: Implement voice cloning with simple "AI Clone" vs "Default" choice
5. **Week 4**: Add voice training UI (only if AI Clone selected)
6. **Week 5**: Add "Animate Scene" hover option in Outline
7. **Week 6**: Add "Animate Story with VoiceOver" button in Writing
8. **Week 7-8**: Testing, optimization, and polish
## Key Design Principles
1. **Provider Abstraction**: Users never see provider names - only quality/voice options
2. **Preserve Existing**: gTTS and HuggingFace remain available as fallbacks
3. **Cost Transparency**: All buttons show costs in tooltips
4. **Automatic Fallback**: System automatically uses free options when credits exhausted
5. **Per-Scene Only**: Outline phase only allows per-scene generation (no bulk)
6. **User-Friendly**: Simple choices like "Standard Quality" not "WaveSpeed 480p"
---
## Risk Mitigation
| Risk | Mitigation |
|------|------------|
| WaveSpeed API changes | Version pinning, abstraction layer |
| Cost overruns | Strict pre-flight validation |
| Voice quality issues | Quality checks, fallback options |
| User confusion | Clear UI, tooltips, documentation |
| Integration complexity | Phased rollout, extensive testing |
---
*Document Version: 1.0*
*Last Updated: January 2025*
*Priority: HIGH - Immediate Implementation*

View File

@@ -0,0 +1,516 @@
# WaveSpeed AI Models Integration: Feature Proposal for ALwrity
## Executive Summary
This document outlines strategic feature enhancements for ALwrity's AI digital marketing platform by integrating advanced AI models from WaveSpeed.ai. These integrations will expand ALwrity's content creation capabilities from text-based content to comprehensive multimedia marketing solutions, positioning ALwrity as a complete end-to-end marketing content platform.
---
## Current ALwrity Capabilities
### Existing Features
- **Text Content Generation**: Blog posts, LinkedIn content, Facebook posts
- **SEO Dashboard**: Comprehensive SEO analysis and optimization
- **Content Strategy**: AI-powered persona development and content calendars
- **Story Writer**: Multi-phase story generation with basic video/image/audio
- **Image Generation**: Stability AI, Gemini, HuggingFace (text-to-image)
- **Video Generation**: Basic text-to-video via HuggingFace (tencent/HunyuanVideo)
### Current Limitations
- Limited video quality options (single provider)
- No audio-synchronized video generation
- No avatar/lipsync capabilities
- Basic image generation (no advanced creative options)
- No voice cloning for personalized audio
- Limited multilingual video content support
---
## Proposed New Features from WaveSpeed Models
### 1. **Advanced Video Content Creation Suite**
#### 1.1 Alibaba WAN 2.5 Text-to-Video
**Model**: `alibaba/wan-2.5/text-to-video`
**Capabilities**:
- Generate 480p/720p/1080p videos from text prompts
- Synchronized audio/voiceover generation
- Automatic lip-sync for generated speech
- Multilingual support (including Chinese)
- Up to 10 seconds duration
- 6 aspect ratio/size options
- Custom audio upload support (3-30 seconds, wav/mp3, ≤15MB)
**ALwrity Marketing Use Cases**:
- **Product Demo Videos**: Create professional product demonstration videos from product descriptions
- **Social Media Shorts**: Generate engaging short-form video content for TikTok, Instagram Reels, YouTube Shorts
- **Educational Content**: Transform blog posts into video tutorials with synchronized narration
- **Promotional Videos**: Create marketing videos with custom voiceovers for campaigns
- **Multilingual Marketing**: Generate video content in multiple languages for global campaigns
- **LinkedIn Video Posts**: Professional video content optimized for LinkedIn engagement
**Integration Points**:
- Extend existing Story Writer video generation
- New "Video Content Creator" module in main dashboard
- Integration with Blog Writer to convert articles to videos
- Social media content calendar with video suggestions
**Pricing Alignment**:
- 480p: $0.05/second
- 720p: $0.10/second
- 1080p: $0.15/second
- More affordable than Google Veo3, making it accessible for solopreneurs
---
#### 1.2 Alibaba WAN 2.5 Image-to-Video
**Model**: `alibaba/wan-2.5/image-to-video`
**Capabilities**:
- Convert static images to dynamic videos
- Add synchronized audio/voiceover
- Maintain image consistency while adding motion
- Same resolution and duration options as text-to-video
**ALwrity Marketing Use Cases**:
- **Product Showcase**: Animate product images for e-commerce
- **Portfolio Enhancement**: Transform static portfolio images into dynamic presentations
- **Social Media Content**: Repurpose existing images into engaging video content
- **Email Marketing**: Create animated product images for email campaigns
- **Website Hero Videos**: Convert hero images into dynamic background videos
- **Before/After Animations**: Create engaging transformation videos
**Integration Points**:
- Connect with existing image generation service
- "Animate Image" feature in image gallery
- Bulk image-to-video conversion for content libraries
- Integration with LinkedIn image posts
---
### 2. **AI Avatar & Personalization Suite**
#### 2.1 Hunyuan Avatar - Audio-Driven Talking Avatars
**Model**: `wavespeed-ai/hunyuan-avatar`
**Capabilities**:
- Create talking/singing avatars from single image + audio
- 480p/720p resolution
- Up to 120 seconds duration
- Character consistency preservation
- Emotion-controllable animations
- Multi-character dialogue support
- High-fidelity lip-sync
**ALwrity Marketing Use Cases**:
- **Personal Branding**: Create personalized video messages from founder/CEO photos
- **Customer Service Videos**: Generate FAQ videos with company spokesperson avatar
- **Training Content**: Create educational videos with consistent instructor avatar
- **Product Explainer Videos**: Use product images or brand mascots as talking avatars
- **Multilingual Content**: Generate videos in multiple languages using the same avatar
- **Email Personalization**: Create personalized video messages for email campaigns
- **Social Media**: Consistent brand spokesperson across all video content
**Integration Points**:
- New "Avatar Studio" module
- Integration with persona system for brand voice consistency
- Connect with voice cloning for complete personalization
- LinkedIn personal branding features
**Pricing**: Starts at $0.15/5 seconds
---
#### 2.2 InfiniteTalk - Long-Form Avatar Lipsync
**Model**: `wavespeed-ai/infinitetalk`
**Capabilities**:
- Audio-driven avatar lipsync (image-to-video)
- Up to 10 minutes duration
- 480p/720p resolution
- Precise lip synchronization
- Full-body coherence (head, face, body movements)
- Identity preservation across the full video length (up to the 10-minute limit)
- Instruction following (text prompts for scene/pose control)
**ALwrity Marketing Use Cases**:
- **Long-Form Content**: Create extended video content (tutorials, webinars, courses)
- **Podcast-to-Video**: Convert audio podcasts into video format with host avatar
- **Webinar Creation**: Generate webinar content with consistent presenter
- **Course Content**: Create educational course videos with instructor avatar
- **Interview Videos**: Transform audio interviews into video format
- **Thought Leadership**: Extended video content for LinkedIn and YouTube
- **Brand Storytelling**: Long-form brand narrative videos
**Integration Points**:
- Extended content creation for Story Writer
- Podcast-to-video conversion tool
- Course content generation module
- YouTube content creation workflow
**Pricing**:
- 480p: $0.15/5 seconds
- 720p: $0.30/5 seconds
- Billing capped at 600 seconds (10 minutes)
---
### 3. **Advanced Image Generation**
#### 3.1 Ideogram V3 Turbo - Photorealistic Image Generation
**Model**: `ideogram-ai/ideogram-v3-turbo`
**Capabilities**:
- High-quality photorealistic image generation
- Creative and styled image creation
- Consistent style maintenance
- Advanced prompt understanding
**ALwrity Marketing Use Cases**:
- **Social Media Visuals**: Create unique, brand-consistent images for social posts
- **Blog Post Images**: Generate custom featured images for blog articles
- **Ad Creative**: Create diverse ad visuals for A/B testing
- **Email Campaign Images**: Custom visuals for email marketing
- **Website Graphics**: Generate hero images, banners, and graphics
- **Product Mockups**: Create product visualization images
- **Brand Assets**: Consistent visual style across all marketing materials
**Integration Points**:
- Enhance existing image generation service
- LinkedIn image generation (already partially implemented)
- Blog Writer image suggestions
- Social media content calendar with image previews
---
#### 3.2 Qwen Image - Text-to-Image
**Model**: `wavespeed-ai/qwen-image/text-to-image`
**Capabilities**:
- High-quality text-to-image generation
- Diverse style options
- Fast generation times
**ALwrity Marketing Use Cases**:
- **Rapid Visual Creation**: Quick image generation for time-sensitive campaigns
- **A/B Testing**: Generate multiple image variations for testing
- **Content Library**: Build library of marketing visuals
- **Brand Consistency**: Maintain visual style across content
**Integration Points**:
- Alternative image generation provider
- Bulk image generation for content calendars
- Integration with content strategy module
---
### 4. **Voice Cloning & Audio Personalization**
#### 4.1 Minimax Voice Clone
**Model**: `minimax/voice-clone`
**Capabilities**:
- Clone voices from audio samples
- Generate personalized voiceovers
- Maintain voice characteristics
- Multilingual voice generation
**ALwrity Marketing Use Cases**:
- **Brand Voice Consistency**: Use founder/CEO voice across all video content
- **Personalized Marketing**: Create personalized video messages that include the customer's name
- **Multilingual Content**: Generate voiceovers in multiple languages with the same voice
- **Podcast Production**: Create consistent podcast host voice
- **Video Narration**: Professional voiceovers for all video content
- **Email Audio**: Add personalized audio messages to email campaigns
- **Social Media**: Consistent voice across all video content
**Integration Points**:
- Connect with Hunyuan Avatar and InfiniteTalk for complete avatar solution
- Integration with WAN 2.5 for synchronized audio
- Voice library management system
- Brand voice consistency across all content
---
## Strategic Feature Prioritization
### Phase 1: High-Impact, Quick Wins (3-4 months)
1. **Alibaba WAN 2.5 Text-to-Video** - Expands video capabilities significantly
2. **Ideogram V3 Turbo** - Enhances existing image generation
3. **Alibaba WAN 2.5 Image-to-Video** - Repurposes existing image assets
**Rationale**: These features build on existing capabilities, require minimal new UI, and provide immediate value to users.
---
### Phase 2: Personalization & Engagement (4-6 months)
4. **Hunyuan Avatar** - Enables personalized video content
5. **Minimax Voice Clone** - Completes personalization suite
6. **Qwen Image** - Additional image generation option
**Rationale**: These features differentiate ALwrity by enabling true personalization, which is critical for modern marketing.
---
### Phase 3: Long-Form Content (6-8 months)
7. **InfiniteTalk** - Enables extended video content creation
**Rationale**: This feature opens new content types (courses, webinars) and requires more complex UI/workflow.
---
## Integration Architecture
### Backend Integration
```
backend/
├── services/
│ ├── llm_providers/
│ │ ├── wavespeed_video_generation.py # WAN 2.5 text/image-to-video
│ │ ├── wavespeed_avatar_generation.py # Hunyuan Avatar, InfiniteTalk
│ │ ├── wavespeed_image_generation.py # Ideogram, Qwen
│ │ └── minimax_voice_clone.py # Voice cloning
│ └── wavespeed/
│ ├── client.py # WaveSpeed API client
│ ├── models.py # Model configurations
│ └── pricing.py # Cost tracking
```
### Frontend Integration
```
frontend/src/
├── components/
│ ├── VideoCreator/
│ │ ├── TextToVideoSection.tsx
│ │ ├── ImageToVideoSection.tsx
│ │ └── VideoPreview.tsx
│ ├── AvatarStudio/
│ │ ├── AvatarCreator.tsx
│ │ ├── VoiceUpload.tsx
│ │ └── AvatarPreview.tsx
│ └── VoiceCloning/
│ ├── VoiceTrainer.tsx
│ └── VoiceLibrary.tsx
```
---
## Business Value & Competitive Advantages
### For Solopreneurs
1. **Cost Efficiency**: More affordable than Google Veo3, making professional video accessible
2. **Time Savings**: Automated video creation eliminates need for video production teams
3. **Multilingual Support**: Reach global audiences without translation teams
4. **Personalization at Scale**: Create personalized content without manual effort
5. **Content Repurposing**: Transform existing content (images, audio) into new formats
### For ALwrity Platform
1. **Market Differentiation**: Complete multimedia content creation platform
2. **Increased User Engagement**: Video content drives higher engagement
3. **Premium Feature Upsell**: Advanced video features for higher-tier plans
4. **Platform Stickiness**: Users create more content types, increasing retention
5. **Competitive Moat**: Comprehensive AI content suite unmatched by competitors
---
## Marketing Use Case Examples
### Use Case 1: Blog-to-Video Conversion
**Scenario**: User creates a blog post about "10 SEO Tips" and wants to convert it to video.
**Workflow**:
1. User selects blog post in ALwrity
2. Clicks "Create Video" button
3. ALwrity uses WAN 2.5 to generate video with synchronized narration
4. User can add custom audio or use AI-generated voice
5. Video is optimized for social media platforms
6. Automatically added to content calendar
**Value**: Single piece of content becomes multi-format, maximizing reach.
---
### Use Case 2: Personalized Email Campaign
**Scenario**: User wants to send personalized video messages to email subscribers.
**Workflow**:
1. User uploads their photo and records a voice sample
2. ALwrity creates voice clone and avatar
3. User writes email campaign message
4. ALwrity generates personalized video for each recipient using Hunyuan Avatar
5. Videos are embedded in email campaign
6. Analytics track video engagement
**Value**: Personalized video emails have 3x higher open rates than text-only emails.
---
### Use Case 3: Multilingual Marketing Campaign
**Scenario**: User wants to launch a product in multiple countries.
**Workflow**:
1. User creates video script in English
2. ALwrity translates script to target languages
3. Uses WAN 2.5 to generate videos in each language with native voice
4. Creates social media posts for each market
5. Schedules content for optimal times in each timezone
**Value**: Global reach without hiring multilingual teams.
---
### Use Case 4: Course Content Creation
**Scenario**: User wants to create an online course with video lessons.
**Workflow**:
1. User uploads course outline and instructor photo
2. Records audio narration for each lesson
3. ALwrity uses InfiniteTalk to create 10-minute video lessons
4. Generates course thumbnails using Ideogram
5. Creates course landing page with video previews
6. Automatically uploads to course platform
**Value**: Professional course content without video production costs.
---
## Technical Considerations
### API Integration
- WaveSpeed provides REST API endpoints
- Need to handle async job processing (videos take time to generate)
- Implement polling or webhook system for job status
- Error handling and retry logic for failed generations
### Storage & CDN
- Video files are large (need efficient storage)
- CDN integration for fast video delivery
- Compression and optimization for web delivery
- Thumbnail generation for video previews
### Subscription & Usage Tracking
- Track video generation usage per user
- Implement rate limiting based on subscription tier
- Cost tracking for WaveSpeed API calls
- Usage analytics dashboard
### Performance Optimization
- Queue system for video generation jobs
- Background processing for long-running tasks
- Caching for frequently used avatars/voices
- Progressive loading for video previews
---
## Pricing Strategy Integration
### Subscription Tier Enhancements
- **Free Tier**: Limited video generation (e.g., 5 videos/month, 480p only)
- **Basic Tier**: Standard video features (20 videos/month, up to 720p)
- **Pro Tier**: Advanced features (50 videos/month, 1080p, avatar features)
- **Enterprise Tier**: Unlimited video generation, all features, custom voice cloning
### Usage-Based Add-ons
- Additional video generation credits
- Premium avatar features
- Extended video duration
- Custom voice cloning training
---
## Success Metrics
### User Engagement
- Video content creation rate
- Average videos per user per month
- Video engagement rates (views, shares)
- User retention (video creators vs. text-only)
### Business Metrics
- Revenue from premium video features
- Average revenue per user (ARPU) increase
- Customer lifetime value (LTV) improvement
- Churn rate reduction
### Content Performance
- Video content performance vs. text content
- Social media engagement rates
- Conversion rates from video content
- SEO performance of video-embedded content
---
## Implementation Roadmap
### Q1 2025: Foundation
- WaveSpeed API integration
- WAN 2.5 text-to-video implementation
- Basic video generation UI
- Usage tracking and billing
### Q2 2025: Enhancement
- WAN 2.5 image-to-video
- Ideogram image generation
- Advanced video settings UI
- Video library and management
### Q3 2025: Personalization
- Hunyuan Avatar integration
- Voice cloning (Minimax) integration
- Avatar studio UI
- Voice library management
### Q4 2025: Advanced Features
- InfiniteTalk for long-form content
- Qwen image generation
- Complete multimedia workflow
- Advanced analytics and optimization
---
## Risk Mitigation
### Technical Risks
- **API Reliability**: Implement retry logic and fallback providers
- **Cost Overruns**: Strict usage limits and pre-flight validation
- **Performance Issues**: Queue system and background processing
- **Storage Costs**: Efficient compression and CDN optimization
### Business Risks
- **Market Adoption**: Gradual rollout with user education
- **Competition**: Focus on unique value (personalization, integration)
- **Pricing Pressure**: Value-based pricing with clear ROI
- **User Experience**: Extensive testing and feedback loops
---
## Conclusion
Integrating WaveSpeed AI models into ALwrity transforms the platform from a text-focused content tool into a comprehensive multimedia marketing solution. These features align perfectly with ALwrity's mission to democratize professional marketing capabilities for solopreneurs.
The proposed features enable:
- **Complete Content Lifecycle**: From text to video to personalized multimedia
- **Cost-Effective Production**: Professional content without expensive production teams
- **Scalable Personalization**: Personalized content at scale
- **Global Reach**: Multilingual content creation
- **Competitive Advantage**: Unique feature set in the market
By implementing these features in a phased approach, ALwrity can deliver immediate value while building toward a comprehensive multimedia content platform that serves as the complete marketing solution for independent entrepreneurs.
---
## Next Steps
1. **Technical Feasibility Review**: Evaluate WaveSpeed API documentation and integration requirements
2. **Cost Analysis**: Calculate infrastructure and API costs for each feature
3. **User Research**: Survey existing users on video content needs and priorities
4. **Prototype Development**: Build MVP for highest-priority feature (WAN 2.5 text-to-video)
5. **Partnership Discussion**: Engage with WaveSpeed for partnership and pricing negotiations
---
*Document Version: 1.0*
*Last Updated: January 2025*
*Author: ALwrity Product Team*

View File

@@ -0,0 +1,165 @@
# WaveSpeed AI Integration: Executive Summary
## Quick Overview
This document summarizes how WaveSpeed AI models can enhance ALwrity's digital marketing platform with advanced video, avatar, image, and voice capabilities.
---
## 🎯 Key Features to Add
### 1. **Professional Video Creation**
- **WAN 2.5 Text-to-Video**: Create 480p/720p/1080p videos from text with synchronized audio
- **WAN 2.5 Image-to-Video**: Animate static images into dynamic videos
- **Use Cases**: Product demos, social media shorts, blog-to-video conversion, multilingual marketing
### 2. **AI Avatar & Personalization**
- **Hunyuan Avatar**: Create talking avatars from photos + audio (up to 2 minutes)
- **InfiniteTalk**: Long-form avatar videos with perfect lip-sync (up to 10 minutes)
- **Use Cases**: Personal branding, customer service videos, course content, personalized email campaigns
### 3. **Advanced Image Generation**
- **Ideogram V3 Turbo**: Photorealistic, creative image generation
- **Qwen Image**: Fast, high-quality text-to-image
- **Use Cases**: Social media visuals, ad creatives, blog images, brand assets
### 4. **Voice Cloning**
- **Minimax Voice Clone**: Clone voices for consistent brand audio
- **Use Cases**: Brand voice consistency, multilingual content, personalized marketing
---
## 💰 Pricing Comparison
| Feature | WaveSpeed Pricing | Current ALwrity | Benefit |
|---------|------------------|-----------------|---------|
| Text-to-Video (1080p) | $0.15/second | HuggingFace only | More affordable than Veo3 |
| Avatar Videos | $0.15-0.30/5s | Not available | New capability |
| Long-Form Video | $0.15-0.30/5s | Not available | Up to 10 minutes |
| Voice Cloning | TBD | Not available | New capability |
---
## 🚀 Implementation Priority
### Phase 1 (Q1 2025) - Quick Wins
1. ✅ WAN 2.5 Text-to-Video - Expands video capabilities
2. ✅ WAN 2.5 Image-to-Video - Repurposes existing images
3. ✅ Ideogram Image Generation - Enhances image quality
### Phase 2 (Q2-Q3 2025) - Personalization
4. ✅ Hunyuan Avatar - Personalized video content
5. ✅ Voice Cloning - Brand voice consistency
### Phase 3 (Q4 2025) - Advanced
6. ✅ InfiniteTalk - Long-form content creation
7. ✅ Qwen Image - Additional image option
---
## 📊 Business Value
### For Users (Solopreneurs)
- **Save Money**: No need for video production teams
- **Save Time**: Automated video creation
- **Scale Globally**: Multilingual content without translation teams
- **Personalize**: Create personalized content at scale
- **Repurpose**: Transform existing content into new formats
### For ALwrity
- **Differentiation**: Complete multimedia platform
- **Engagement**: Video drives 3x higher engagement
- **Revenue**: Premium features for higher-tier plans
- **Retention**: More content types = higher stickiness
- **Competitive Edge**: Unmatched AI content suite
---
## 🎬 Real-World Use Cases
### Use Case 1: Blog-to-Video
**Problem**: User has a great blog post but wants a video version
**Solution**: One-click conversion using WAN 2.5
**Result**: Single content piece becomes multi-format
### Use Case 2: Personalized Email Campaign
**Problem**: User wants personalized video messages
**Solution**: Hunyuan Avatar + Voice Clone
**Result**: 3x higher email open rates
### Use Case 3: Multilingual Launch
**Problem**: Launching product in multiple countries
**Solution**: WAN 2.5 with multilingual support
**Result**: Global reach without translation teams
### Use Case 4: Online Course Creation
**Problem**: Need professional course videos
**Solution**: InfiniteTalk for long-form content
**Result**: Professional course without production costs
---
## 🔧 Technical Requirements
### Backend
- WaveSpeed API client integration
- Async job processing (videos take time)
- Usage tracking and billing
- Storage and CDN for video files
### Frontend
- Video creation UI components
- Avatar studio interface
- Voice cloning interface
- Video library and management
### Infrastructure
- Video storage (large files)
- CDN for fast delivery
- Queue system for background jobs
- Cost monitoring and limits
---
## 📈 Success Metrics
- **User Engagement**: Video creation rate, videos per user
- **Business**: Revenue from premium features, ARPU increase
- **Content**: Video engagement rates, conversion rates
- **Retention**: Video creators vs. text-only users
---
## ⚠️ Risks & Mitigation
| Risk | Mitigation |
|------|------------|
| API Reliability | Retry logic, fallback providers |
| Cost Overruns | Strict usage limits, pre-flight validation |
| Performance | Queue system, background processing |
| Adoption | Gradual rollout, user education |
---
## ✅ Next Steps
1. **Review**: Technical feasibility and API documentation
2. **Analyze**: Cost structure and infrastructure needs
3. **Research**: User needs and priorities
4. **Prototype**: MVP for WAN 2.5 text-to-video
5. **Partner**: Engage WaveSpeed for pricing/partnership
---
## 📝 Key Takeaways
1. **Complete Multimedia Platform**: Transform ALwrity from text-focused to full multimedia
2. **Cost-Effective**: More affordable than competitors (Veo3, etc.)
3. **Personalization**: Unique avatar and voice cloning capabilities
4. **Scalability**: Multilingual and automated content creation
5. **Competitive Advantage**: Unmatched feature set in the market
---
*For the detailed implementation plan, see `WAVESPEED_AI_FEATURE_PROPOSAL.md`*

View File

@@ -0,0 +1,335 @@
# WaveSpeed AI Integration: Complete Implementation Roadmap
## Overview
This document provides a unified roadmap for implementing WaveSpeed AI models across ALwrity's platform. It consolidates the three focused implementation plans:
1. **Story Writer Video Enhancement** - Immediate value, replaces HuggingFace as the primary video provider (HuggingFace remains available as a fallback)
2. **Persona Voice & Avatar Hyper-Personalization** - Core differentiator
3. **LinkedIn Writer Multimedia Revamp** - Engagement driver
---
## Implementation Priority Matrix
| Feature | Priority | Timeline | Impact | Effort |
|---------|----------|----------|--------|--------|
| Story Writer: WaveSpeed Video | **HIGH** | Week 1-2 | Immediate value, solves current issues | Medium |
| Story Writer: Voice Cloning | **HIGH** | Week 3-4 | Significant quality improvement | Medium |
| Persona: Voice Training | **HIGH** | Week 1-3 | Core hyper-personalization | High |
| Persona: Avatar Creation | **HIGH** | Week 4-6 | Visual personalization | High |
| LinkedIn: Video Posts | **HIGH** | Week 1-3 | Engagement driver | Medium |
| LinkedIn: Avatar Videos | **HIGH** | Week 6-7 | Personal branding | Medium |
| LinkedIn: Enhanced Images | **MEDIUM** | Week 4-5 | Quality improvement | Low |
| LinkedIn: Audio Narration | **MEDIUM** | Week 8-9 | Complete suite | Low |
---
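## Phased Implementation Plan

> **Note**: The three phases run as overlapping workstreams rather than strictly in sequence (their week ranges all start at Week 1). See the Timeline Summary under Resource Allocation for how they map onto the calendar.
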
### Phase 1: Foundation (Weeks 1-4)
**Goal**: Replace HuggingFace as the primary video provider (keeping it as a fallback) and add voice cloning to Story Writer
**Deliverables**:
- ✅ WaveSpeed WAN 2.5 video generation
- ✅ Minimax voice cloning
- ✅ Story Writer video enhancement
- ✅ Story Writer audio enhancement
- ✅ Cost management and validation
**Success Criteria**:
- Story Writer videos work reliably
- Voice quality significantly improved
- Cost tracking accurate
- User satisfaction improved
---
### Phase 2: Hyper-Personalization (Weeks 1-6)
**Goal**: Integrate voice and avatar into Persona System
**Deliverables**:
- ✅ Voice training in onboarding
- ✅ Avatar creation in onboarding
- ✅ Persona voice integration
- ✅ Persona avatar integration
- ✅ Persona dashboard enhancements
**Success Criteria**:
- Users can train voice/avatar during onboarding
- Persona voice/avatar used across platform
- Brand consistency achieved
- High adoption rate (>60% of Pro users)
---
### Phase 3: LinkedIn Multimedia (Weeks 1-9)
**Goal**: Transform LinkedIn Writer into multimedia platform
**Deliverables**:
- ✅ Video post generation
- ✅ Avatar video posts
- ✅ Enhanced image generation
- ✅ Audio narration
- ✅ Unified multimedia creator
**Success Criteria**:
- Users can create multimedia LinkedIn posts
- Engagement rates improved (3x target)
- High-quality content generation
- Cost-effective for users
---
## Shared Infrastructure
### Common Services
**WaveSpeed API Client** (`backend/services/wavespeed/`):
- Shared across Story Writer, LinkedIn, Persona
- Unified error handling
- Cost tracking
- Rate limiting
**Voice Cloning Service** (`backend/services/minimax/`):
- Shared across Story Writer, LinkedIn, Persona
- Voice library management
- Training queue
- Usage tracking
**Avatar Service** (`backend/services/wavespeed/avatar/`):
- Shared across LinkedIn, Persona
- Avatar library management
- Generation queue
- Usage tracking
### Cost Management
**Unified Cost Tracking**:
- Pre-flight validation across all features
- Real-time cost estimation
- Usage limits per tier
- Cost optimization recommendations
**Subscription Integration**:
- Unified pricing service
- Tier-based feature access
- Usage tracking and alerts
- Cost breakdown analytics
---
## Resource Allocation
### Development Team
**Backend Developers** (2-3):
- Week 1-2: WaveSpeed integration
- Week 3-4: Voice cloning integration
- Week 5-6: Avatar integration
- Week 7-9: LinkedIn multimedia
**Frontend Developers** (2):
- Week 1-2: Story Writer UI updates
- Week 3-4: Voice training UI
- Week 5-6: Avatar creation UI
- Week 7-9: LinkedIn multimedia UI
**QA/Testing** (1):
- Continuous testing throughout
- User acceptance testing
- Performance testing
- Cost validation testing
### Timeline Summary
```
Month 1 (Weeks 1-4):
├─ Story Writer: WaveSpeed + Voice Cloning
└─ Persona: Voice Training
Month 2 (Weeks 5-8):
├─ Persona: Avatar Creation
├─ LinkedIn: Video Posts
└─ LinkedIn: Enhanced Images
Month 3 (Weeks 9-12):
├─ LinkedIn: Avatar Videos
├─ LinkedIn: Audio Narration
└─ Complete Integration & Polish
```
---
## Cost Management Strategy
### Pre-Flight Validation
**Implementation**: Unified validation service
**Checks**:
1. User subscription tier
2. Feature availability
3. Usage limits
4. Cost estimates
5. Budget remaining
**Benefits**:
- Prevents wasted API calls
- Clear user feedback
- Cost transparency
- Better user experience
### Cost Optimization
**Strategies**:
1. **Default to Cost-Effective Options**: 480p/720p default, 1080p premium
2. **Batch Processing**: Lower costs for multiple items
3. **Caching**: Reuse generated content when possible
4. **Smart Defaults**: Optimize settings automatically
5. **Usage Limits**: Per-tier limits prevent overuse
### Pricing Transparency
**User-Facing**:
- Real-time cost estimates
- Per-feature cost breakdown
- Monthly budget tracking
- Cost optimization suggestions
---
## Success Metrics
### Technical Metrics
- API success rate >95%
- Average generation time <30s
- Error rate <2%
- Cost accuracy >99%
### User Metrics
- Feature adoption rate >50%
- User satisfaction >4.5/5
- Content quality >4.5/5
- Retention improvement >20%
### Business Metrics
- Premium tier conversion +30%
- User engagement +200%
- Content generation volume +150%
- Cost per user <$10/month average
---
## Risk Management
### Technical Risks
| Risk | Probability | Impact | Mitigation |
|------|------------|--------|------------|
| API reliability | Medium | High | Retry logic, fallbacks |
| Cost overruns | Medium | High | Pre-flight validation |
| Quality issues | Low | Medium | Quality checks, previews |
| Performance | Low | Medium | Queue system, optimization |
### Business Risks
| Risk | Probability | Impact | Mitigation |
|------|------------|--------|------------|
| Low adoption | Medium | Medium | User education, tutorials |
| High costs | Low | High | Tier limits, cost estimates |
| User confusion | Medium | Low | Clear UI, documentation |
| Competition | Low | Medium | Unique features, quality |
---
## Dependencies
### External Dependencies
- WaveSpeed API access and credentials
- Minimax API access and credentials
- API documentation and support
- Pricing agreements
### Internal Dependencies
- Persona system (existing)
- Subscription system (existing)
- Story Writer (existing)
- LinkedIn Writer (existing)
- Cost tracking infrastructure
---
## Next Steps
### Immediate (Week 1)
1. ✅ Secure WaveSpeed API access
2. ✅ Secure Minimax API access
3. ✅ Review API documentation
4. ✅ Set up development environment
5. ✅ Create project plan and assign tasks
### Short-term (Weeks 2-4)
1. ✅ Implement WaveSpeed video generation
2. ✅ Implement voice cloning
3. ✅ Update Story Writer
4. ✅ Testing and optimization
### Medium-term (Weeks 5-8)
1. ✅ Implement persona voice/avatar
2. ✅ Implement LinkedIn video posts
3. ✅ Testing and optimization
### Long-term (Weeks 9-12)
1. ✅ Complete LinkedIn multimedia suite
2. ✅ Full integration testing
3. ✅ User acceptance testing
4. ✅ Documentation and launch
---
## Documentation
### For Developers
- API integration guides
- Service architecture docs
- Testing procedures
- Deployment guides
### For Users
- Feature guides
- Video tutorials
- Best practices
- FAQ and troubleshooting
### For Business
- Cost analysis
- ROI projections
- Success metrics
- Competitive analysis
---
## Conclusion
This roadmap provides a comprehensive plan for integrating WaveSpeed AI models into ALwrity, transforming it from a text-focused platform into a complete multimedia content creation suite. The phased approach ensures:
1. **Immediate Value**: Story Writer improvements solve current issues
2. **Core Differentiation**: Persona hyper-personalization sets ALwrity apart
3. **Engagement Growth**: LinkedIn multimedia drives user engagement
4. **Cost Effectiveness**: Careful cost management prevents waste
5. **Scalable Foundation**: Shared infrastructure supports future growth
**Key Success Factors**:
- Phased implementation reduces risk
- Cost management prevents waste
- User education ensures adoption
- Quality focus ensures satisfaction
- Integration creates competitive advantage
---
*Document Version: 1.0*
*Last Updated: January 2025*
*Status: Ready for Implementation*

View File

@@ -95,25 +95,38 @@ const StoryExport: React.FC<StoryExportProps> = ({ state }) => {
try {
// Prepare image and audio URLs in scene order
const imageUrls: string[] = [];
const imageUrls: (string | null)[] = [];
const audioUrls: string[] = [];
const scenes = state.outlineScenes;
const videoUrls: (string | null)[] = [];
for (const scene of scenes) {
const sceneNumber = scene.scene_number || scenes.indexOf(scene) + 1;
const imageUrl = state.sceneImages?.get(sceneNumber);
const audioUrl = state.sceneAudio?.get(sceneNumber);
const animatedVideoUrl = state.sceneAnimatedVideos?.get(sceneNumber);
if (imageUrl && audioUrl) {
imageUrls.push(imageUrl);
audioUrls.push(audioUrl);
} else {
throw new Error(`Missing image or audio for scene ${sceneNumber}`);
if (!audioUrl) {
throw new Error(`Missing audio for scene ${sceneNumber}`);
}
// Prefer animated video if available, otherwise use image
if (animatedVideoUrl) {
videoUrls.push(animatedVideoUrl);
imageUrls.push(null);
} else if (imageUrl) {
videoUrls.push(null);
imageUrls.push(imageUrl);
} else {
throw new Error(`Missing image or animated video for scene ${sceneNumber}`);
}
audioUrls.push(audioUrl);
}
if (imageUrls.length !== scenes.length || audioUrls.length !== scenes.length) {
throw new Error('Number of images and audio files must match number of scenes');
throw new Error('Number of images/videos and audio files must match number of scenes');
}
// Start async video generation
@@ -121,6 +134,8 @@ const StoryExport: React.FC<StoryExportProps> = ({ state }) => {
scenes: scenes,
image_urls: imageUrls,
audio_urls: audioUrls,
video_urls: videoUrls.length > 0 ? videoUrls : undefined,
ai_audio_urls: undefined, // TODO: Track AI audio separately in state
story_title: state.storySetting || 'Story',
fps: state.videoFps,
transition_duration: state.videoTransitionDuration,
@@ -147,7 +162,11 @@ const StoryExport: React.FC<StoryExportProps> = ({ state }) => {
state.setStoryVideo(videoUrl);
// fetch blob for authenticated preview
const blobUrl = await fetchMediaBlobUrl(videoUrl);
setVideoBlobUrl(blobUrl);
if (blobUrl) {
setVideoBlobUrl(blobUrl);
} else {
setVideoBlobUrl(null);
}
setVideoProgress(100);
setVideoMessage('Video generation complete');
state.setError(null);
@@ -175,6 +194,9 @@ const StoryExport: React.FC<StoryExportProps> = ({ state }) => {
const handleDownloadVideo = async () => {
if (state.storyVideo) {
const blobUrl = await fetchMediaBlobUrl(state.storyVideo);
if (!blobUrl) {
return;
}
const a = document.createElement('a');
a.href = blobUrl;
a.download = `story-video-${Date.now()}.mp4`;

View File

@@ -14,9 +14,9 @@ import GlobalStyles from '@mui/material/GlobalStyles';
import ImageIcon from '@mui/icons-material/Image';
import VolumeUpIcon from '@mui/icons-material/VolumeUp';
import { motion, AnimatePresence } from 'framer-motion';
import { useStoryWriterState } from '../../../hooks/useStoryWriterState';
import { useStoryWriterState, SceneAnimationResume } from '../../../hooks/useStoryWriterState';
import { storyWriterApi } from '../../../services/storyWriterApi';
import { aiApiClient } from '../../../api/client';
import { aiApiClient, triggerSubscriptionError } from '../../../api/client';
import OutlineHoverActions from './StoryOutlineParts/OutlineHoverActions';
import EditSectionModal from './StoryOutlineParts/EditSectionModal';
import { leftPageVariants, rightPageVariants } from './StoryOutlineParts/pageVariants';
@@ -48,7 +48,9 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
const [imageLoadError, setImageLoadError] = useState<Set<number>>(new Set());
const [imageBlobUrls, setImageBlobUrls] = useState<Map<number, string>>(new Map());
const [audioBlobUrls, setAudioBlobUrls] = useState<Map<number, string>>(new Map());
const [videoBlobUrls, setVideoBlobUrls] = useState<Map<number, string>>(new Map());
const [audioLoadError, setAudioLoadError] = useState<Set<number>>(new Set());
const [hasVideoLoadError, setVideoLoadError] = useState<Set<number>>(new Set());
const [outlineToastOpen, setOutlineToastOpen] = useState(false);
const lastToastSceneCount = useRef<number | null>(null);
const [isEditModalOpen, setIsEditModalOpen] = useState(false);
@@ -66,15 +68,182 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
const [isKeyEventsModalOpen, setIsKeyEventsModalOpen] = useState(false);
const [isTitleModalOpen, setIsTitleModalOpen] = useState(false);
const [titleDraft, setTitleDraft] = useState('');
const [animatingSceneNumber, setAnimatingSceneNumber] = useState<number | null>(null);
// Use state from hook instead of local state
const sceneImages = state.sceneImages || new Map<number, string>();
const sceneAudio = state.sceneAudio || new Map<number, string>();
const sceneAnimatedVideos = state.sceneAnimatedVideos || new Map<number, string>();
const sceneAnimationResumables = state.sceneAnimationResumables || new Map<number, SceneAnimationResume>();
/**
 * Record the animated video URL for a scene and invalidate everything that was
 * cached for the previous video of that scene.
 *
 * - Publishes a fresh copy of the scene -> video URL map through the story state.
 * - Revokes and forgets the scene's cached blob URL so the new video gets fetched.
 * - Removes the scene from the video load-error set so loading is attempted again.
 */
const updateSceneAnimatedVideo = (sceneNumber: number, videoUrl: string) => {
  const updatedVideos = new Map(state.sceneAnimatedVideos || []);
  updatedVideos.set(sceneNumber, videoUrl);
  state.setSceneAnimatedVideos(updatedVideos);

  // Drop the stale blob URL (releasing the object URL) for this scene.
  setVideoBlobUrls((cachedUrls) => {
    const remainingUrls = new Map(cachedUrls);
    const staleBlobUrl = remainingUrls.get(sceneNumber);
    if (staleBlobUrl) {
      URL.revokeObjectURL(staleBlobUrl);
    }
    remainingUrls.delete(sceneNumber);
    return remainingUrls;
  });

  // A previous load failure must not block loading the new video.
  setVideoLoadError((failedScenes) => {
    const remainingFailures = new Set(failedScenes);
    remainingFailures.delete(sceneNumber);
    return remainingFailures;
  });
};
/**
 * Animate the current scene together with its narration audio.
 *
 * Requires a structured outline, plus an image and an audio URL for the scene.
 * Starts an asynchronous task through `storyWriterApi.animateSceneVoiceover`,
 * then polls `getTaskStatus` until the task completes or fails. On completion
 * the video URL from the task result is stored via `updateSceneAnimatedVideo`.
 *
 * Any failure (including a polling timeout) is surfaced through `setError`;
 * `animatingSceneNumber` is always cleared in `finally`.
 */
const handleAnimateSceneWithVoiceover = async () => {
  // Delay between status checks.
  const POLL_INTERVAL_MS = 2000;
  // Upper bound on polling. The task is expected to take up to ~10 minutes, so
  // allow some margin; without a bound a stuck task would be polled forever and
  // the "animating" state would never be cleared.
  const MAX_POLL_DURATION_MS = 15 * 60 * 1000;

  if (!hasScenes || !currentScene) {
    setError('Please generate your outline before animating scenes.');
    return;
  }
  const sceneNumber = currentScene.scene_number || currentSceneIndex + 1;
  const sceneImageRelativeUrl = state.sceneImages?.get(sceneNumber);
  const sceneAudioRelativeUrl = state.sceneAudio?.get(sceneNumber);
  if (!sceneImageRelativeUrl) {
    setError('Please generate an image for this scene before animating it.');
    return;
  }
  if (!sceneAudioRelativeUrl) {
    setError('Please generate narration audio for this scene before animating with voiceover.');
    return;
  }

  setAnimatingSceneNumber(sceneNumber);
  setError(null);
  // A new attempt supersedes any stored resume information for this scene.
  updateSceneAnimationResume(sceneNumber, undefined);
  const storyContextPayload = createStoryContextPayload();

  try {
    console.info('[Outline] Animate scene with voiceover requested', {
      sceneNumber,
      image: sceneImageRelativeUrl,
      audio: sceneAudioRelativeUrl,
    });

    // Start async task
    const startResponse = await storyWriterApi.animateSceneVoiceover({
      scene_number: sceneNumber,
      scene_data: currentScene,
      story_context: storyContextPayload,
      image_url: sceneImageRelativeUrl,
      audio_url: sceneAudioRelativeUrl,
      resolution: '720p',
    });

    // Poll for completion (InfiniteTalk can take up to 10 minutes)
    const taskId = startResponse.task_id;
    const pollDeadline = Date.now() + MAX_POLL_DURATION_MS;
    let done = false;
    while (!done) {
      await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
      const status = await storyWriterApi.getTaskStatus(taskId);
      if (status.status === 'completed') {
        done = true;
        const result = await storyWriterApi.getTaskResult(taskId);
        // The result can be either the AnimateSceneResponse directly or wrapped in a result field
        const animationResult = (result as any).result || result;
        const videoUrl = animationResult.video_url;
        const cost = animationResult.cost || 0;
        if (videoUrl) {
          updateSceneAnimatedVideo(sceneNumber, videoUrl);
          console.info('[Outline] Animate with voiceover completed', {
            sceneNumber,
            video: videoUrl,
            cost: cost,
          });
        } else {
          throw new Error('Video URL not found in result');
        }
      } else if (status.status === 'failed') {
        throw new Error(status.error || 'InfiniteTalk animation failed');
      } else if (Date.now() >= pollDeadline) {
        // Still 'pending'/'processing' after the whole budget: give up instead of looping forever.
        throw new Error('Timed out waiting for the voiceover animation to finish. Please try again.');
      }
      // Otherwise keep polling while status is 'pending' or 'processing'
    }
  } catch (err: any) {
    const detail = err?.response?.data?.detail;
    const handled = await triggerSubscriptionError(err);
    // `err?.message` keeps the handler from throwing if a non-Error value was rejected.
    const message = extractDetailMessage(detail, err?.message || 'Failed to animate scene with voiceover.');
    setError(message);
    if (!handled) {
      console.error('[Outline] Animate scene with voiceover failed', err);
    }
  } finally {
    setAnimatingSceneNumber(null);
  }
};
/**
 * Store or clear the resume information for a scene's animation.
 *
 * Passing `info` records it for `sceneNumber`; omitting it removes the entry.
 * The story state receives a fresh map, or `null` once no entries remain.
 */
const updateSceneAnimationResume = (sceneNumber: number, info?: SceneAnimationResume) => {
  const currentResumables = state.sceneAnimationResumables || new Map<number, SceneAnimationResume>();
  const updatedResumables = new Map(currentResumables);

  if (!info) {
    updatedResumables.delete(sceneNumber);
  } else {
    updatedResumables.set(sceneNumber, info);
  }

  state.setSceneAnimationResumables(updatedResumables.size === 0 ? null : updatedResumables);
};
/**
 * Turn an error `detail` payload into display text.
 *
 * - A non-empty string is returned as is.
 * - For an object, the first of `message`, `error`, `detail` that holds a
 *   string is returned.
 * - Anything else (including falsy values) yields `fallback`.
 */
const extractDetailMessage = (detail: any, fallback: string): string => {
  if (typeof detail === 'string') {
    // An empty string is treated like a missing detail.
    return detail || fallback;
  }
  if (!detail || typeof detail !== 'object') {
    return fallback;
  }
  for (const field of ['message', 'error', 'detail']) {
    const candidate = detail[field];
    if (typeof candidate === 'string') {
      return candidate;
    }
  }
  return fallback;
};
/**
 * Inspect an error `detail` payload for a resumable animation.
 *
 * When `detail` is an object with truthy `resume_available` and
 * `prediction_id`, the resume information is stored for the scene through
 * `updateSceneAnimationResume` and the message shown to the user is returned.
 * Otherwise nothing is stored and `null` is returned.
 */
const captureResumeOpportunity = (
  sceneNumber: number,
  duration: 5 | 10,
  detail: any
): string | null => {
  const isResumable =
    Boolean(detail) &&
    typeof detail === 'object' &&
    Boolean(detail.resume_available) &&
    Boolean(detail.prediction_id);
  if (!isResumable) {
    return null;
  }

  // Prefer the server's own wording, falling back to a generic hint.
  let message: string;
  if (typeof detail.message === 'string') {
    message = detail.message;
  } else if (typeof detail.error === 'string') {
    message = detail.error;
  } else {
    message = 'WaveSpeed is still finalizing this animation. Click Resume to download without extra cost.';
  }

  const resumeInfo = {
    predictionId: detail.prediction_id,
    duration,
    message,
    createdAt: new Date().toISOString(),
  };
  updateSceneAnimationResume(sceneNumber, resumeInfo);
  return message;
};
const scenes = state.outlineScenes || [];
const sceneCount = scenes.length;
const hasScenes = state.isOutlineStructured && scenes.length > 0;
const hasOutlineScenes = Boolean(state.outlineScenes && state.outlineScenes.length > 0);
const resumableScenesArray = Array.from(sceneAnimationResumables.entries());
const resumableSummaryMessage =
resumableScenesArray.length === 0
? null
: resumableScenesArray.length === 1
? resumableScenesArray[0][1]?.message ||
`Scene ${resumableScenesArray[0][0]} animation is ready to resume without extra cost.`
: `Scenes ${resumableScenesArray.map(([scene]) => scene).join(', ')} have WaveSpeed animations ready to resume without extra cost. Open each scene and click Resume Animation.`;
// removed old accordion renderer (unused)
@@ -98,10 +267,14 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
// Get the current scene's image URL
const currentSceneNumber = currentScene?.scene_number || currentSceneIndex + 1;
const currentSceneResumeInfo = sceneAnimationResumables.get(currentSceneNumber) || null;
const canAnimateCurrentScene = !animatingSceneNumber && !currentSceneResumeInfo;
const isCurrentSceneAnimating = animatingSceneNumber === currentSceneNumber;
const currentSceneImageUrl = sceneImages.get(currentSceneNumber);
const hasImageLoadError = imageLoadError.has(currentSceneNumber);
const currentSceneAudioUrl = sceneAudio.get(currentSceneNumber);
const hasAudioLoadError = audioLoadError.has(currentSceneNumber);
const hasAudioForScene = Boolean(currentSceneAudioUrl);
// Fetch image as blob with authentication
useEffect(() => {
@@ -128,8 +301,12 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
next.set(currentSceneNumber, blobUrl);
return next;
});
} catch (err) {
console.error('Failed to load image:', err);
} catch (err: any) {
// Only log non-404 errors (404 means file doesn't exist, which is acceptable)
if (err?.response?.status !== 404) {
console.error('Failed to load image:', err);
}
// Mark as error to prevent retries
setImageLoadError((prev) => new Set(prev).add(currentSceneNumber));
}
};
@@ -137,6 +314,47 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
loadImage();
}, [currentSceneNumber, currentSceneImageUrl, hasImageLoadError]);
// Fetch the current scene's animated video as a blob through the authenticated client
// and cache an object URL for it in `videoBlobUrls`.
// The effect does nothing when the scene has no animated video URL, when the scene is
// already in the load-error set, or when a blob URL is already cached for it.
// Note: `hasVideoLoadError` is the Set of failed scene numbers (not a boolean).
// NOTE(review): the cache entry is only written after the request resolves and there is
// no cancellation, so a re-run of this effect while a request is in flight looks like it
// could start a second fetch for the same scene — confirm whether that is acceptable.
useEffect(() => {
const animatedVideoRelativeUrl = sceneAnimatedVideos.get(currentSceneNumber);
if (!animatedVideoRelativeUrl || hasVideoLoadError.has(currentSceneNumber) || videoBlobUrls.has(currentSceneNumber)) {
return;
}
const loadVideo = async () => {
try {
// Remove query parameters (token) from URL if present, we'll use authenticated request instead
const cleanUrl = animatedVideoRelativeUrl.split('?')[0];
// Use relative URL path directly (aiApiClient will add base URL and auth)
// Ensure the path starts with a single leading slash.
const videoUrl = cleanUrl.startsWith('/')
? cleanUrl
: `/${cleanUrl}`;
// Use aiApiClient to get authenticated response with blob
const response = await aiApiClient.get(videoUrl, {
responseType: 'blob',
});
const blob = response.data;
const blobUrl = URL.createObjectURL(blob);
// Store the object URL under the scene number in a fresh Map so React sees a new state value.
setVideoBlobUrls((prev) => {
const next = new Map(prev);
next.set(currentSceneNumber, blobUrl);
return next;
});
} catch (err: any) {
// Only log non-404 errors (404 means file doesn't exist, which is acceptable)
if (err?.response?.status !== 404) {
console.error('Failed to load video:', err);
}
// Mark as error to prevent retries
setVideoLoadError((prev) => new Set(prev).add(currentSceneNumber));
}
};
loadVideo();
}, [currentSceneNumber, sceneAnimatedVideos, hasVideoLoadError, videoBlobUrls]);
// Cleanup blob URLs when component unmounts or scenes change
useEffect(() => {
return () => {
@@ -147,13 +365,36 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
audioBlobUrls.forEach((blobUrl) => {
URL.revokeObjectURL(blobUrl);
});
videoBlobUrls.forEach((blobUrl) => {
URL.revokeObjectURL(blobUrl);
});
};
}, []);
const currentSceneImageFullUrl = imageBlobUrls.get(currentSceneNumber) || null;
const currentSceneAudioFullUrl = audioBlobUrls.get(currentSceneNumber) || null;
const resolvedSceneAudioUrl =
currentSceneAudioFullUrl ||
(currentSceneAudioUrl ? storyWriterApi.getAudioUrl(currentSceneAudioUrl) : null);
const currentSceneAnimatedVideoUrl = videoBlobUrls.get(currentSceneNumber) || null;
// Reset image load error when scene changes
/**
 * Build the story-context object sent along with animation requests.
 * Maps the camelCase story state fields onto the snake_case request keys.
 */
const createStoryContextPayload = () => {
  const {
    persona,
    storySetting,
    characters,
    plotElements,
    writingStyle,
    storyTone,
    narrativePOV,
    audienceAgeGroup,
    contentRating,
    storyLength,
    premise,
    outline,
    storyContent,
  } = state;

  return {
    persona,
    story_setting: storySetting,
    characters,
    plot_elements: plotElements,
    writing_style: writingStyle,
    story_tone: storyTone,
    narrative_pov: narrativePOV,
    audience_age_group: audienceAgeGroup,
    content_rating: contentRating,
    story_length: storyLength,
    premise,
    outline,
    story_content: storyContent,
  };
};
// Reset image/audio/video load errors when scene changes (to allow retry for new scene)
useEffect(() => {
setImageLoadError((prev) => {
const next = new Set(prev);
@@ -165,6 +406,11 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
next.delete(currentSceneNumber);
return next;
});
setVideoLoadError((prev) => {
const next = new Set(prev);
next.delete(currentSceneNumber);
return next;
});
}, [currentSceneNumber]);
useEffect(() => {
@@ -192,9 +438,20 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
const loadAudio = async () => {
try {
const audioPath = currentSceneAudioUrl.startsWith('/')
? currentSceneAudioUrl
: `/${currentSceneAudioUrl}`;
// Remove query parameters (token) from URL if present, we'll use authenticated request instead
const cleanUrl = currentSceneAudioUrl.split('?')[0];
// Normalize path - ensure it starts with /api/story/audio/
let audioPath = cleanUrl.startsWith('/')
? cleanUrl
: `/${cleanUrl}`;
// If path doesn't include /api/story/audio/, add it
if (!audioPath.includes('/api/story/audio/')) {
// Extract filename from path
const filename = cleanUrl.split('/').pop() || cleanUrl;
audioPath = `/api/story/audio/${filename}`;
}
const response = await aiApiClient.get(audioPath, {
responseType: 'blob',
});
@@ -210,8 +467,19 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
next.set(currentSceneNumber, blobUrl);
return next;
});
} catch (err) {
console.error('Failed to load audio:', err);
} catch (err: any) {
// Only log non-404 errors (404 means file doesn't exist, which is acceptable)
if (err?.response?.status !== 404) {
console.error(`Failed to load audio for scene ${currentSceneNumber}:`, err);
console.error(`Audio URL was: ${currentSceneAudioUrl}`);
// If auth error, log more details
if (err?.response?.status === 401) {
console.error(`Authentication failed for audio file. Make sure auth token is set.`);
}
}
// Mark as error to prevent retries
setAudioLoadError((prev) => new Set(prev).add(currentSceneNumber));
}
};
@@ -444,6 +712,104 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
}
};
/**
 * Animate the current scene's image into a short video (5 second duration).
 *
 * Requires a structured outline and an image for the scene. On success the
 * returned video URL is stored and any resume information for the scene is
 * cleared. On failure the error detail is checked for a resumable prediction
 * (stored via `captureResumeOpportunity`), the subscription error handler is
 * triggered, and a message is shown through `setError`.
 */
const handleAnimateScene = async () => {
  if (!hasScenes || !currentScene) {
    setError('Please generate your outline before animating scenes.');
    return;
  }

  const sceneNumber = currentScene.scene_number || currentSceneIndex + 1;
  const imageRelativeUrl = state.sceneImages?.get(sceneNumber);
  if (!imageRelativeUrl) {
    setError('Please generate an image for this scene before animating it.');
    return;
  }

  setAnimatingSceneNumber(sceneNumber);
  setError(null);
  // Starting over: forget any pending resume information for this scene.
  updateSceneAnimationResume(sceneNumber, undefined);

  const storyContext = createStoryContextPayload();
  const animationDuration: 5 | 10 = 5;

  try {
    console.info('[Outline] Animate scene requested', {
      sceneNumber,
      duration: 5,
      image: imageRelativeUrl,
    });

    const animation = await storyWriterApi.animateScene({
      scene_number: sceneNumber,
      scene_data: currentScene,
      story_context: storyContext,
      image_url: imageRelativeUrl,
      duration: animationDuration,
    });

    updateSceneAnimatedVideo(sceneNumber, animation.video_url);
    updateSceneAnimationResume(sceneNumber, undefined);

    console.info('[Outline] Animate scene completed', {
      sceneNumber,
      video: animation.video_url,
      cost: animation.cost,
      prediction: animation.prediction_id || 'n/a',
    });
  } catch (err: any) {
    const detail = err?.response?.data?.detail;
    const resumeMessage = captureResumeOpportunity(sceneNumber, animationDuration, detail);
    const handled = await triggerSubscriptionError(err);
    const message = resumeMessage || extractDetailMessage(detail, err.message || 'Failed to animate scene.');
    setError(message);
    // Logging is skipped only when the failure was BOTH resumable and handled as a subscription error.
    // NOTE(review): the intent may have been "skip when either applies" — confirm.
    if (!(resumeMessage && handled)) {
      console.error('[Outline] Animate scene failed', err);
    }
  } finally {
    setAnimatingSceneNumber(null);
  }
};
/**
 * Resume a previously started scene animation using its stored prediction id.
 *
 * On success the video URL is stored for the scene and its resume information
 * is cleared. On failure the message is shown through `setError`, the
 * subscription error handler is triggered and the error is logged.
 */
const handleResumeSceneAnimation = async (
  sceneNumber: number,
  resumeInfo: SceneAnimationResume
) => {
  const { predictionId, duration } = resumeInfo;

  setAnimatingSceneNumber(sceneNumber);
  setError(null);

  try {
    console.info('[Outline] Resume scene requested', {
      sceneNumber,
      prediction: predictionId,
    });

    const resumed = await storyWriterApi.resumeAnimateScene({
      prediction_id: predictionId,
      scene_number: sceneNumber,
      duration,
    });

    updateSceneAnimatedVideo(sceneNumber, resumed.video_url);
    updateSceneAnimationResume(sceneNumber, undefined);

    console.info('[Outline] Resume scene completed', {
      sceneNumber,
      video: resumed.video_url,
      cost: resumed.cost,
      prediction: resumed.prediction_id || predictionId,
    });
  } catch (err: any) {
    const detail = err?.response?.data?.detail;
    const fallback = err.message || 'Failed to resume animation.';
    setError(extractDetailMessage(detail, fallback));
    await triggerSubscriptionError(err);
    console.error('[Outline] Resume scene failed', err);
  } finally {
    setAnimatingSceneNumber(null);
  }
};
const handleRegenerateCurrentSceneImage = async () => {
if (!hasScenes || !currentScene) return;
setIsRegeneratingSceneImage(true);
@@ -532,6 +898,12 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
</Alert>
)}
{resumableSummaryMessage && (
<Alert severity="info" sx={{ mb: 3 }}>
{resumableSummaryMessage}
</Alert>
)}
{!state.premise && (
<Alert severity="warning" sx={{ mb: 3 }}>
Please generate a premise first in the Setup phase.
@@ -552,17 +924,24 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
imageUrl={currentSceneImageFullUrl}
onImageError={() => setImageLoadError((prev) => new Set(prev).add(currentSceneNumber))}
narrationEnabled={!!state.enableNarration}
audioUrl={
currentSceneAudioFullUrl || (state.sceneAudio && state.sceneAudio.get(currentSceneNumber)
? storyWriterApi.getAudioUrl(state.sceneAudio.get(currentSceneNumber) || '')
: null)
}
audioUrl={resolvedSceneAudioUrl || null}
hasAudio={hasAudioForScene}
onOpenImageModal={openImageModal}
onOpenAudioModal={openAudioModal}
onOpenCharactersModal={openCharactersModal}
onOpenKeyEventsModal={openKeyEventsModal}
onOpenTitleModal={openTitleModal}
onOpenEditModal={openEditModal}
onAnimateScene={canAnimateCurrentScene ? handleAnimateScene : undefined}
onAnimateWithVoiceover={hasAudioForScene ? handleAnimateSceneWithVoiceover : undefined}
onResumeScene={
currentSceneResumeInfo && !animatingSceneNumber
? () => handleResumeSceneAnimation(currentSceneNumber, currentSceneResumeInfo)
: undefined
}
resumeInfo={currentSceneResumeInfo}
isAnimatingScene={isCurrentSceneAnimating}
animatedVideoUrl={currentSceneAnimatedVideoUrl}
/>
<OutlineActionsBar
isGenerating={isGenerating}
@@ -617,6 +996,50 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
(state.setOutlineScenes as any)(updated);
setIsImageModalOpen(false);
}}
onRegenerate={async (prompt: string) => {
if (!hasScenes || !currentScene) return;
setIsRegeneratingSceneImage(true);
try {
const sceneNum = currentScene.scene_number || currentSceneIndex + 1;
const sceneTitle = currentScene.title || `Scene ${sceneNum}`;
const resp = await storyWriterApi.regenerateSceneImage({
scene_number: sceneNum,
scene_title: sceneTitle,
prompt: prompt.trim(),
provider: state.imageProvider || undefined,
width: state.imageWidth,
height: state.imageHeight,
model: state.imageModel || undefined,
});
if (resp.success && resp.image_url) {
const nextMap = new Map(state.sceneImages || []);
nextMap.set(sceneNum, resp.image_url);
state.setSceneImages(nextMap);
// Update the scene with the new prompt if generation was successful
const updated = [...scenes];
updated[currentSceneIndex] = { ...updated[currentSceneIndex], image_prompt: prompt.trim() };
(state.setOutlineScenes as any)(updated);
setImagePromptDraft(prompt.trim());
// Close the modal after successful regeneration
setIsImageModalOpen(false);
} else {
throw new Error(resp.error || 'Failed to regenerate image');
}
} catch (err: any) {
console.error('Failed to regenerate scene image:', err);
throw err; // Re-throw to be handled by modal
} finally {
setIsRegeneratingSceneImage(false);
}
}}
imageProvider={state.imageProvider}
imageWidth={state.imageWidth}
imageHeight={state.imageHeight}
imageModel={state.imageModel}
/>
<AudioScriptModal
open={isAudioModalOpen}
@@ -644,6 +1067,94 @@ const StoryOutline: React.FC<StoryOutlineProps> = ({ state, onNext }) => {
? storyWriterApi.getAudioUrl(state.sceneAudio.get(currentSceneNumber) || '')
: currentSceneAudioFullUrl) || null
}
onGenerateAI={async (params: {
text: string;
voice_id?: string;
speed?: number;
volume?: number;
pitch?: number;
emotion?: string;
}) => {
if (!hasScenes || !currentScene) return;
setIsRegeneratingSceneAudio(true);
try {
const sceneNum = currentScene.scene_number || currentSceneIndex + 1;
const sceneTitle = currentScene.title || `Scene ${sceneNum}`;
const resp = await storyWriterApi.generateAIAudio({
scene_number: sceneNum,
scene_title: sceneTitle,
text: params.text.trim(),
voice_id: params.voice_id || 'Wise_Woman',
speed: params.speed !== undefined ? params.speed : 1.0,
volume: params.volume !== undefined ? params.volume : 1.0,
pitch: params.pitch !== undefined ? params.pitch : 0.0,
emotion: params.emotion || 'happy',
});
if (resp.success && resp.audio_url) {
const nextMap = new Map(state.sceneAudio || []);
nextMap.set(sceneNum, resp.audio_url);
state.setSceneAudio(nextMap);
// Update the scene with the new audio_narration if generation was successful
const updated = [...scenes];
updated[currentSceneIndex] = { ...updated[currentSceneIndex], audio_narration: params.text.trim() };
(state.setOutlineScenes as any)(updated);
setAudioScriptDraft(params.text.trim());
// Close the modal after successful generation
setIsAudioModalOpen(false);
} else {
throw new Error(resp.error || 'Failed to generate AI audio');
}
} catch (err: any) {
console.error('Failed to generate AI audio:', err);
throw err; // Re-throw to be handled by modal
} finally {
setIsRegeneratingSceneAudio(false);
}
}}
onGenerateFree={async (text: string) => {
if (!hasScenes || !currentScene) return;
setIsRegeneratingSceneAudio(true);
try {
const sceneNum = currentScene.scene_number || currentSceneIndex + 1;
const sceneTitle = currentScene.title || `Scene ${sceneNum}`;
const resp = await storyWriterApi.generateFreeAudio({
scene_number: sceneNum,
scene_title: sceneTitle,
text: text.trim(),
provider: state.audioProvider || 'gtts',
lang: state.audioLang || 'en',
slow: state.audioSlow || false,
rate: state.audioRate || 150,
});
if (resp.success && resp.audio_url) {
const nextMap = new Map(state.sceneAudio || []);
nextMap.set(sceneNum, resp.audio_url);
state.setSceneAudio(nextMap);
// Update the scene with the new audio_narration if generation was successful
const updated = [...scenes];
updated[currentSceneIndex] = { ...updated[currentSceneIndex], audio_narration: text.trim() };
(state.setOutlineScenes as any)(updated);
setAudioScriptDraft(text.trim());
// Close the modal after successful generation
setIsAudioModalOpen(false);
} else {
throw new Error(resp.error || 'Failed to generate free audio');
}
} catch (err: any) {
console.error('Failed to generate free audio:', err);
throw err; // Re-throw to be handled by modal
} finally {
setIsRegeneratingSceneAudio(false);
}
}}
/>
<CharactersModal
open={isCharactersModalOpen}

View File

@@ -1,5 +1,14 @@
import React from 'react';
import { Box, Button, Dialog, DialogActions, DialogContent, DialogTitle, TextField } from '@mui/material';
import {
Box, Button, Dialog, DialogActions, DialogContent, DialogTitle,
TextField, Divider, CircularProgress, Typography, Tooltip, IconButton,
Slider, FormControl, InputLabel, Select, MenuItem, FormHelperText,
ToggleButtonGroup, ToggleButton
} from '@mui/material';
import VolumeUpIcon from '@mui/icons-material/VolumeUp';
import SmartToyIcon from '@mui/icons-material/SmartToy';
import InfoOutlinedIcon from '@mui/icons-material/InfoOutlined';
import { OperationButton } from '../../../shared/OperationButton';
interface AudioScriptModalProps {
open: boolean;
@@ -18,14 +27,114 @@ interface AudioScriptModalProps {
onChangeSlow: (v: boolean) => void;
onChangeRate: (v: number) => void;
audioUrl?: string | null;
// audio generation callbacks - now with full parameters
onGenerateAI?: (params: {
text: string;
voice_id?: string;
speed?: number;
volume?: number;
pitch?: number;
emotion?: string;
}) => Promise<void>;
onGenerateFree?: (text: string) => Promise<void>;
}
// Available voice IDs from WaveSpeed Minimax
// Each entry carries the `value` sent as `voice_id`, a human-readable `label`,
// and a short `description` of the voice.
// 'Wise_Woman' is also the initial `voiceId` state of the modal below.
const AVAILABLE_VOICES = [
{ value: 'Wise_Woman', label: 'Wise Woman', description: 'Warm, authoritative female voice' },
{ value: 'Friendly_Person', label: 'Friendly Person', description: 'Approachable and conversational' },
{ value: 'Inspirational_girl', label: 'Inspirational Girl', description: 'Energetic and motivating' },
{ value: 'Deep_Voice_Man', label: 'Deep Voice Man', description: 'Rich, deep male voice' },
{ value: 'Calm_Woman', label: 'Calm Woman', description: 'Peaceful and soothing' },
{ value: 'Casual_Guy', label: 'Casual Guy', description: 'Relaxed and informal' },
{ value: 'Lively_Girl', label: 'Lively Girl', description: 'Vibrant and enthusiastic' },
{ value: 'Patient_Man', label: 'Patient Man', description: 'Steady and reassuring' },
{ value: 'Young_Knight', label: 'Young Knight', description: 'Brave and confident' },
{ value: 'Determined_Man', label: 'Determined Man', description: 'Strong and resolute' },
{ value: 'Lovely_Girl', label: 'Lovely Girl', description: 'Sweet and charming' },
{ value: 'Decent_Boy', label: 'Decent Boy', description: 'Polite and well-mannered' },
{ value: 'Imposing_Manner', label: 'Imposing Manner', description: 'Commanding and powerful' },
{ value: 'Elegant_Man', label: 'Elegant Man', description: 'Sophisticated and refined' },
{ value: 'Abbess', label: 'Abbess', description: 'Dignified and wise' },
{ value: 'Sweet_Girl_2', label: 'Sweet Girl 2', description: 'Gentle and kind' },
{ value: 'Exuberant_Girl', label: 'Exuberant Girl', description: 'Joyful and energetic' },
];
// Emotion options for AI audio generation: `value` is sent as the `emotion`
// parameter, `label`/`description` are display text.
// NOTE(review): 'neutral' is described as "(default)", but the modal's initial
// `emotion` state is 'happy' — confirm which one is intended and align them.
const EMOTIONS = [
{ value: 'happy', label: 'Happy', description: 'Cheerful and upbeat tone' },
{ value: 'sad', label: 'Sad', description: 'Melancholic and somber tone' },
{ value: 'angry', label: 'Angry', description: 'Intense and forceful tone' },
{ value: 'fear', label: 'Fear', description: 'Anxious and nervous tone' },
{ value: 'surprised', label: 'Surprised', description: 'Astonished and amazed tone' },
{ value: 'neutral', label: 'Neutral', description: 'Calm and balanced tone (default)' },
];
const AudioScriptModal: React.FC<AudioScriptModalProps> = ({
open, sceneNumber, value, onChange, onClose, onSave,
audioProvider, audioLang, audioSlow, audioRate,
onChangeProvider, onChangeLang, onChangeSlow, onChangeRate,
audioUrl,
onGenerateAI,
onGenerateFree,
}) => {
const [isGeneratingAI, setIsGeneratingAI] = React.useState(false);
const [isGeneratingFree, setIsGeneratingFree] = React.useState(false);
const [generateError, setGenerateError] = React.useState<string | null>(null);
// Audio type toggle - default to 'free'
const [audioType, setAudioType] = React.useState<'free' | 'ai'>('free');
// AI Audio generation parameters with intelligent defaults
const [voiceId, setVoiceId] = React.useState<string>('Wise_Woman');
const [customVoiceId, setCustomVoiceId] = React.useState<string>('');
const [useCustomVoice, setUseCustomVoice] = React.useState<boolean>(false);
const [emotion, setEmotion] = React.useState<string>('happy');
const [speed, setSpeed] = React.useState<number>(1.0);
const [volume, setVolume] = React.useState<number>(1.0);
const [pitch, setPitch] = React.useState<number>(0.0);
/**
 * Convert a caught error into plain text that is safe to render.
 *
 * `err.response.data.detail` may be a string or a structured object; storing an
 * object in `generateError` would make React throw when it is rendered as a
 * child. For objects the first string among `message`, `error`, `detail` is
 * used; otherwise `err.message`, then `fallback`.
 */
const describeGenerationError = (err: any, fallback: string): string => {
  const detail = err?.response?.data?.detail;
  if (typeof detail === 'string' && detail) {
    return detail;
  }
  if (detail && typeof detail === 'object') {
    for (const field of ['message', 'error', 'detail']) {
      const candidate = detail[field];
      if (typeof candidate === 'string' && candidate) {
        return candidate;
      }
    }
  }
  return (typeof err?.message === 'string' && err.message) || fallback;
};

/**
 * Generate AI narration audio for the current script using the selected voice
 * and tuning parameters. Does nothing when no `onGenerateAI` callback was
 * provided or the script is blank. Errors thrown by the callback are shown in
 * the modal via `generateError`.
 */
const handleGenerateAI = async () => {
  if (!onGenerateAI || !value.trim()) {
    return;
  }
  setIsGeneratingAI(true);
  setGenerateError(null);
  try {
    await onGenerateAI({
      text: value.trim(),
      voice_id: useCustomVoice ? customVoiceId : voiceId,
      emotion: emotion,
      speed: speed,
      volume: volume,
      pitch: pitch,
    });
    // Optionally close modal after successful generation
    // onClose();
  } catch (err: any) {
    setGenerateError(describeGenerationError(err, 'Failed to generate AI audio'));
  } finally {
    setIsGeneratingAI(false);
  }
};

/**
 * Generate narration audio with the free provider for the current script.
 * Does nothing when no `onGenerateFree` callback was provided or the script is
 * blank. Errors thrown by the callback are shown in the modal via `generateError`.
 */
const handleGenerateFree = async () => {
  if (!onGenerateFree || !value.trim()) {
    return;
  }
  setIsGeneratingFree(true);
  setGenerateError(null);
  try {
    await onGenerateFree(value.trim());
    // Optionally close modal after successful generation
    // onClose();
  } catch (err: any) {
    setGenerateError(describeGenerationError(err, 'Failed to generate free audio'));
  } finally {
    setIsGeneratingFree(false);
  }
};
return (
<Dialog
open={open}
@@ -42,14 +151,43 @@ const AudioScriptModal: React.FC<AudioScriptModalProps> = ({
}}
>
<DialogTitle>Edit Audio Narration Script (Scene {sceneNumber})</DialogTitle>
<DialogContent dividers sx={{ color: '#2C2416' }}>
<DialogContent dividers sx={{ color: '#2C2416', bgcolor: '#fff' }}>
<Box
sx={{
display: 'flex',
flexDirection: 'column',
gap: 2,
'& .MuiFormLabel-root': { color: '#6b5846' },
'& .MuiInputBase-root': { color: '#2C2416' },
gap: 3,
pt: 1,
'& .MuiFormLabel-root': { color: '#5D4037', fontWeight: 500 },
'& .MuiInputBase-root': {
color: '#2C2416',
bgcolor: '#fff',
'& .MuiOutlinedInput-notchedOutline': {
borderColor: 'rgba(0, 0, 0, 0.23)',
},
'&:hover .MuiOutlinedInput-notchedOutline': {
borderColor: 'rgba(0, 0, 0, 0.87)',
},
'&.Mui-focused .MuiOutlinedInput-notchedOutline': {
borderColor: 'primary.main',
borderWidth: '2px',
},
},
'& .MuiInputBase-input': {
color: '#2C2416',
},
'& textarea': {
color: '#2C2416',
},
'& .MuiSelect-select': {
color: '#2C2416',
},
'& .MuiFormHelperText-root': {
color: 'rgba(0, 0, 0, 0.6)',
},
'& .MuiMenuItem-root': {
color: '#2C2416',
},
}}
>
{audioUrl ? (
@@ -73,40 +211,387 @@ const AudioScriptModal: React.FC<AudioScriptModalProps> = ({
multiline
minRows={6}
fullWidth
placeholder="Enter the narration text for this scene..."
sx={{
'& .MuiInputBase-input': {
color: '#2C2416',
},
}}
/>
<Box sx={{ display: 'grid', gridTemplateColumns: { xs: '1fr', md: '1fr 1fr' }, gap: 2 }}>
<TextField
select
label="Audio Provider"
value={audioProvider}
onChange={(e) => onChangeProvider(e.target.value)}
SelectProps={{ native: true }}
>
<option value="gtts">gTTS</option>
<option value="pyttsx3">pyttsx3</option>
</TextField>
<TextField
label="Language (e.g., en, hi)"
value={audioLang}
onChange={(e) => onChangeLang(e.target.value)}
/>
<TextField
select
label="Slow (gTTS)"
value={audioSlow ? 'true' : 'false'}
onChange={(e) => onChangeSlow(e.target.value === 'true')}
SelectProps={{ native: true }}
>
<option value="false">Normal</option>
<option value="true">Slow</option>
</TextField>
<TextField
type="number"
label="Rate (pyttsx3)"
value={audioRate}
onChange={(e) => onChangeRate(Number(e.target.value))}
inputProps={{ min: 50, max: 300, step: 10 }}
/>
{generateError && (
<Box sx={{ color: 'error.main', fontSize: '0.875rem', mt: -1 }}>
{generateError}
</Box>
)}
<Divider sx={{ my: 1 }} />
{/* Audio Type Toggle */}
<Box sx={{ display: 'flex', flexDirection: 'column', gap: 2 }}>
<Box>
<Typography variant="subtitle2" sx={{ mb: 1.5, fontWeight: 600, color: '#5D4037' }}>
Audio Type
</Typography>
<ToggleButtonGroup
value={audioType}
exclusive
onChange={(_, newValue) => {
if (newValue !== null) {
setAudioType(newValue);
setGenerateError(null);
}
}}
aria-label="audio type"
fullWidth
sx={{
'& .MuiToggleButton-root': {
textTransform: 'none',
borderColor: 'rgba(0, 0, 0, 0.23)',
color: '#5D4037',
'&.Mui-selected': {
backgroundColor: 'primary.main',
color: '#fff',
'&:hover': {
backgroundColor: 'primary.dark',
},
},
'&:hover': {
backgroundColor: 'rgba(0, 0, 0, 0.04)',
},
},
}}
>
<ToggleButton value="free" aria-label="free audio">
<VolumeUpIcon sx={{ mr: 1 }} />
Free Audio (gTTS)
</ToggleButton>
<ToggleButton value="ai" aria-label="ai audio">
<SmartToyIcon sx={{ mr: 1 }} />
AI Audio (Minimax)
</ToggleButton>
</ToggleButtonGroup>
</Box>
{/* Generate Button - Context aware based on audio type */}
<Box sx={{ display: 'flex', gap: 2, flexWrap: 'wrap' }}>
{audioType === 'ai' && onGenerateAI && (
<OperationButton
operation={{
provider: 'audio',
model: 'minimax/speech-02-hd',
tokens_requested: value.trim().length, // Every character is 1 token
operation_type: 'audio_generation',
actual_provider_name: 'wavespeed',
}}
label="Generate AI Audio"
variant="contained"
size="medium"
startIcon={<SmartToyIcon />}
showCost={true}
checkOnHover={true}
checkOnMount={false}
onClick={handleGenerateAI}
disabled={isGeneratingAI || isGeneratingFree || !value.trim()}
loading={isGeneratingAI}
sx={{ flex: 1, minWidth: '200px' }}
/>
)}
{audioType === 'free' && onGenerateFree && (
<Button
variant="contained"
size="medium"
startIcon={isGeneratingFree ? <CircularProgress size={16} /> : <VolumeUpIcon />}
onClick={handleGenerateFree}
disabled={isGeneratingAI || isGeneratingFree || !value.trim()}
sx={{ flex: 1, minWidth: '200px' }}
>
{isGeneratingFree ? 'Generating...' : 'Generate Free Audio (gTTS)'}
</Button>
)}
</Box>
<Divider sx={{ my: 1 }} />
{/* Settings - Conditionally shown based on audio type */}
{audioType === 'ai' && (
<Box>
<Typography variant="subtitle2" sx={{ mb: 2, fontWeight: 600, color: '#5D4037' }}>
AI Audio Generation Settings
</Typography>
<Box sx={{ display: 'grid', gridTemplateColumns: { xs: '1fr', md: '1fr 1fr' }, gap: 2 }}>
{/* Voice Selection */}
<FormControl fullWidth>
<InputLabel>Voice</InputLabel>
<Select
value={useCustomVoice ? 'custom' : voiceId}
onChange={(e) => {
if (e.target.value === 'custom') {
setUseCustomVoice(true);
} else {
setUseCustomVoice(false);
setVoiceId(e.target.value);
}
}}
label="Voice"
renderValue={(value) => {
if (value === 'custom') {
return customVoiceId || 'Custom Voice ID';
}
const voice = AVAILABLE_VOICES.find(v => v.value === value);
return voice ? voice.label : value;
}}
>
{AVAILABLE_VOICES.map((voice) => (
<MenuItem key={voice.value} value={voice.value}>
<Box>
<Typography variant="body2" sx={{ fontWeight: 500 }}>
{voice.label}
</Typography>
<Typography variant="caption" sx={{ color: 'text.secondary' }}>
{voice.description}
</Typography>
</Box>
</MenuItem>
))}
<MenuItem value="custom">
<Box>
<Typography variant="body2" sx={{ fontWeight: 500, fontStyle: 'italic' }}>
Custom Voice ID...
</Typography>
<Typography variant="caption" sx={{ color: 'text.secondary' }}>
Use a voice ID from voice cloning
</Typography>
</Box>
</MenuItem>
</Select>
<FormHelperText>
<Box sx={{ display: 'flex', alignItems: 'center', gap: 0.5 }}>
Choose a voice that matches your story's tone
<Tooltip
title={
<Box sx={{ p: 0.5 }}>
<Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
Current Voice ID: {voiceId}
</Typography>
<Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
You can use system voices above or enter a custom voice ID from voice cloning.
</Typography>
<Typography variant="caption" sx={{ display: 'block' }}>
Learn more:{' '}
<a
href="https://wavespeed.ai/models/minimax/voice-clone"
target="_blank"
rel="noopener noreferrer"
style={{ color: '#90caf9' }}
>
Voice Cloning Guide
</a>
</Typography>
</Box>
}
arrow
placement="top"
>
<InfoOutlinedIcon sx={{ fontSize: '0.875rem', color: 'text.secondary', cursor: 'help' }} />
</Tooltip>
</Box>
</FormHelperText>
</FormControl>
{/* Custom Voice ID Input (shown when custom voice is selected) */}
{useCustomVoice && (
<TextField
fullWidth
label="Custom Voice ID"
value={customVoiceId}
onChange={(e) => setCustomVoiceId(e.target.value)}
helperText="Enter your custom voice ID from voice cloning"
placeholder="your-custom-voice-id"
/>
)}
{/* Emotion Selection */}
<FormControl fullWidth>
<InputLabel>Emotion</InputLabel>
<Select
value={emotion}
onChange={(e) => setEmotion(e.target.value)}
label="Emotion"
>
{EMOTIONS.map((em) => (
<MenuItem key={em.value} value={em.value}>
<Box>
<Typography variant="body2">{em.label}</Typography>
<Typography variant="caption" sx={{ color: 'text.secondary' }}>
{em.description}
</Typography>
</Box>
</MenuItem>
))}
</Select>
<FormHelperText>
Select the emotional tone for the narration
</FormHelperText>
</FormControl>
{/* Speed Slider */}
<Box>
<Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 1 }}>
<Typography variant="body2" sx={{ minWidth: '60px' }}>
Speed
</Typography>
<Slider
value={speed}
onChange={(_, newValue) => setSpeed(newValue as number)}
min={0.5}
max={2.0}
step={0.1}
valueLabelDisplay="auto"
valueLabelFormat={(value) => `${value}x`}
sx={{ flex: 1 }}
/>
<Typography variant="body2" sx={{ minWidth: '40px', textAlign: 'right' }}>
{speed.toFixed(1)}x
</Typography>
</Box>
<FormHelperText>
<Box sx={{ display: 'flex', alignItems: 'center', gap: 0.5 }}>
Speech speed (0.5x = slow, 1.0x = normal, 2.0x = fast)
<Tooltip
title="Adjust how fast the narration speaks. 1.0 is normal speed, suitable for most content."
arrow
placement="top"
>
<InfoOutlinedIcon sx={{ fontSize: '0.875rem', color: 'text.secondary', cursor: 'help' }} />
</Tooltip>
</Box>
</FormHelperText>
</Box>
{/* Volume Slider */}
<Box>
<Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 1 }}>
<Typography variant="body2" sx={{ minWidth: '60px' }}>
Volume
</Typography>
<Slider
value={volume}
onChange={(_, newValue) => setVolume(newValue as number)}
min={0.1}
max={10.0}
step={0.1}
valueLabelDisplay="auto"
valueLabelFormat={(value) => `${value.toFixed(1)}`}
sx={{ flex: 1 }}
/>
<Typography variant="body2" sx={{ minWidth: '40px', textAlign: 'right' }}>
{volume.toFixed(1)}
</Typography>
</Box>
<FormHelperText>
<Box sx={{ display: 'flex', alignItems: 'center', gap: 0.5 }}>
Audio volume level (0.1 = quiet, 1.0 = normal, 10.0 = loud)
<Tooltip
title="Control the loudness of the audio. 1.0 is standard volume. Increase for emphasis, decrease for subtlety."
arrow
placement="top"
>
<InfoOutlinedIcon sx={{ fontSize: '0.875rem', color: 'text.secondary', cursor: 'help' }} />
</Tooltip>
</Box>
</FormHelperText>
</Box>
{/* Pitch Slider */}
<Box sx={{ gridColumn: { xs: '1', md: '1 / -1' } }}>
<Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 1 }}>
<Typography variant="body2" sx={{ minWidth: '60px' }}>
Pitch
</Typography>
<Slider
value={pitch}
onChange={(_, newValue) => setPitch(newValue as number)}
min={-12}
max={12}
step={1}
valueLabelDisplay="auto"
valueLabelFormat={(value) => `${value > 0 ? '+' : ''}${value}`}
marks={[
{ value: -12, label: '-12' },
{ value: 0, label: '0' },
{ value: 12, label: '+12' },
]}
sx={{ flex: 1 }}
/>
<Typography variant="body2" sx={{ minWidth: '50px', textAlign: 'right' }}>
{pitch > 0 ? '+' : ''}{pitch}
</Typography>
</Box>
<FormHelperText>
<Box sx={{ display: 'flex', alignItems: 'center', gap: 0.5 }}>
Voice pitch adjustment (-12 = lower, 0 = normal, +12 = higher)
<Tooltip
title="Adjust the pitch of the voice. Negative values make the voice deeper, positive values make it higher. 0 keeps the natural voice pitch."
arrow
placement="top"
>
<InfoOutlinedIcon sx={{ fontSize: '0.875rem', color: 'text.secondary', cursor: 'help' }} />
</Tooltip>
</Box>
</FormHelperText>
</Box>
</Box>
</Box>
)}
{audioType === 'free' && (
<Box>
<Typography variant="subtitle2" sx={{ mb: 2, fontWeight: 600, color: '#5D4037' }}>
Free Audio (gTTS) Settings
</Typography>
<Box sx={{ display: 'grid', gridTemplateColumns: { xs: '1fr', md: '1fr 1fr' }, gap: 2 }}>
<TextField
select
label="Audio Provider"
value={audioProvider}
onChange={(e) => onChangeProvider(e.target.value)}
SelectProps={{ native: true }}
helperText="Text-to-speech engine for free audio generation"
>
<option value="gtts">gTTS (Google Text-to-Speech)</option>
<option value="pyttsx3">pyttsx3 (Offline)</option>
</TextField>
<TextField
label="Language"
value={audioLang}
onChange={(e) => onChangeLang(e.target.value)}
helperText="Language code (e.g., en for English, hi for Hindi)"
placeholder="en"
/>
<TextField
select
label="Speech Speed (gTTS)"
value={audioSlow ? 'true' : 'false'}
onChange={(e) => onChangeSlow(e.target.value === 'true')}
SelectProps={{ native: true }}
helperText="Whether to speak slowly (useful for clarity)"
>
<option value="false">Normal Speed</option>
<option value="true">Slow Speed</option>
</TextField>
<TextField
type="number"
label="Speech Rate (pyttsx3)"
value={audioRate}
onChange={(e) => onChangeRate(Number(e.target.value))}
inputProps={{ min: 50, max: 300, step: 10 }}
helperText="Words per minute (50-300, default: 150)"
/>
</Box>
</Box>
)}
</Box>
</Box>
</DialogContent>

View File

@@ -1,12 +1,17 @@
import React from 'react';
import { Box, Typography, Tooltip, Chip } from '@mui/material';
import { Box, Typography, Tooltip, Chip, CircularProgress } from '@mui/material';
import { motion, AnimatePresence } from 'framer-motion';
import OutlineHoverActions from './OutlineHoverActions';
import EditNoteIcon from '@mui/icons-material/EditNote';
import VolumeUpIcon from '@mui/icons-material/VolumeUp';
import TipsAndUpdatesIcon from '@mui/icons-material/TipsAndUpdates';
import PlayArrowIcon from '@mui/icons-material/PlayArrow';
import GraphicEqIcon from '@mui/icons-material/GraphicEq';
import ReplayIcon from '@mui/icons-material/Replay';
import { OperationButton } from '../../../shared/OperationButton';
import { leftPageVariants, rightPageVariants } from './pageVariants';
import { storyWriterApi, StoryScene } from '../../../../services/storyWriterApi';
import { StoryScene } from '../../../../services/storyWriterApi';
import type { SceneAnimationResume } from '../../../../hooks/useStoryWriterState';
const MotionBox = motion(Box);
@@ -33,12 +38,19 @@ interface BookPagesProps {
narrationEnabled: boolean;
audioUrl: string | null;
hasAudio: boolean;
onOpenImageModal: () => void;
onOpenAudioModal: () => void;
onOpenCharactersModal: () => void;
onOpenKeyEventsModal: () => void;
onOpenTitleModal: () => void;
onOpenEditModal: () => void;
onAnimateScene?: () => void;
onResumeScene?: () => void;
onAnimateWithVoiceover?: () => void;
isAnimatingScene?: boolean;
animatedVideoUrl?: string | null;
resumeInfo?: SceneAnimationResume | null;
}
const BookPages: React.FC<BookPagesProps> = ({
@@ -56,12 +68,22 @@ const BookPages: React.FC<BookPagesProps> = ({
onOpenImageModal,
onOpenAudioModal,
audioUrl,
hasAudio,
onOpenCharactersModal,
onOpenKeyEventsModal,
onOpenTitleModal,
onOpenEditModal,
onAnimateScene,
onResumeScene,
onAnimateWithVoiceover,
isAnimatingScene,
animatedVideoUrl,
resumeInfo,
}) => {
const currentSceneNumber = currentScene?.scene_number || currentSceneIndex + 1;
const showAnimatedVideo = Boolean(animatedVideoUrl);
const hasImage = Boolean(imageUrl);
const hasMedia = showAnimatedVideo || hasImage;
return (
<Box sx={{ mb: 4, display: 'flex', justifyContent: 'center' }}>
@@ -213,13 +235,43 @@ const BookPages: React.FC<BookPagesProps> = ({
overflowY: 'auto',
mt: 3,
display: 'grid',
gridTemplateRows: imageUrl ? 'auto 1fr auto auto' : 'auto auto auto 1fr',
gridTemplateRows: hasMedia ? 'auto 1fr auto auto' : 'auto auto auto 1fr',
alignContent: 'start',
gap: 3,
}}
>
<Box sx={{ position: 'relative', '&:hover .left-image-actions': { opacity: 1, pointerEvents: 'auto' } }}>
{imageUrl ? (
{showAnimatedVideo ? (
<Box
sx={{
width: '100%',
borderRadius: '12px',
overflow: 'hidden',
boxShadow: '0 12px 24px rgba(0, 0, 0, 0.2)',
border: '3px solid rgba(120, 90, 60, 0.25)',
backgroundColor: '#000',
}}
>
<Box
component="video"
src={animatedVideoUrl ?? undefined}
poster={imageUrl ?? undefined}
autoPlay
muted
loop
controls
playsInline
sx={{
width: '100%',
height: 'auto',
display: 'block',
minHeight: '300px',
maxHeight: '500px',
objectFit: 'cover',
}}
/>
</Box>
) : hasImage ? (
<>
{/* Removed 'Scene Illustration' heading for cleaner look */}
<Box
@@ -239,7 +291,7 @@ const BookPages: React.FC<BookPagesProps> = ({
>
<Box
component="img"
src={imageUrl}
src={imageUrl || undefined}
alt={currentScene?.title || `Scene ${currentSceneNumber} illustration`}
sx={{
width: '100%',
@@ -258,11 +310,13 @@ const BookPages: React.FC<BookPagesProps> = ({
top: 8,
right: 8,
display: 'flex',
flexDirection: 'column',
gap: 1,
opacity: 0,
pointerEvents: 'none',
transition: 'opacity 0.2s ease',
zIndex: 5,
zIndex: 5,
alignItems: 'flex-end',
}}
>
<Tooltip title="Edit scene image prompt">
@@ -286,6 +340,152 @@ const BookPages: React.FC<BookPagesProps> = ({
<EditNoteIcon />
</Box>
</Tooltip>
{hasImage && onAnimateScene && (
<Box
onClick={(e) => {
e.stopPropagation();
}}
sx={{ display: 'inline-flex', pointerEvents: 'auto' }}
>
<OperationButton
operation={{
provider: 'video',
model: 'kling-v2.5-turbo-std-5s',
operation_type: 'scene_animation',
actual_provider_name: 'wavespeed',
}}
label="Animate Scene"
variant="contained"
size="small"
startIcon={<PlayArrowIcon />}
showCost
checkOnHover
checkOnMount={false}
onClick={onAnimateScene}
disabled={isAnimatingScene}
sx={{
minWidth: 'auto',
padding: '8px',
width: 40,
height: 40,
borderRadius: '50%',
background: 'linear-gradient(135deg, #1f8a70 0%, #32d9c8 100%)',
boxShadow: '0 8px 16px rgba(31,138,112,0.35)',
color: 'white',
'&:hover': {
background: 'linear-gradient(135deg, #1a7a60 0%, #2dc9b8 100%)',
},
'& .MuiButton-startIcon': {
margin: 0,
},
'& .MuiButton-label': {
display: 'none',
},
}}
tooltipPlacement="left"
/>
</Box>
)}
{hasImage && hasAudio && onAnimateWithVoiceover && (
<Box
onClick={(e) => {
e.stopPropagation();
}}
sx={{ display: 'inline-flex', pointerEvents: 'auto' }}
>
<OperationButton
operation={{
provider: 'video',
model: 'wavespeed-ai/infinitetalk',
operation_type: 'scene_animation_voiceover',
actual_provider_name: 'wavespeed',
}}
label="Animate with Voiceover"
variant="contained"
size="small"
startIcon={<GraphicEqIcon />}
showCost
checkOnHover
checkOnMount={false}
onClick={onAnimateWithVoiceover}
disabled={isAnimatingScene}
sx={{
minWidth: 'auto',
padding: '8px',
width: 40,
height: 40,
borderRadius: '50%',
background: 'linear-gradient(135deg, #733dd9 0%, #bb86fc 100%)',
boxShadow: '0 8px 16px rgba(115,61,217,0.35)',
color: 'white',
'&:hover': {
background: 'linear-gradient(135deg, #6030ba 0%, #a974f1 100%)',
},
'& .MuiButton-startIcon': {
margin: 0,
},
'& .MuiButton-label': {
display: 'none',
},
}}
tooltipPlacement="left"
/>
</Box>
)}
{resumeInfo && onResumeScene && (
<Tooltip
title={resumeInfo.message || 'Resume animation download (no extra cost)'}
placement="left"
>
<Box
onClick={(e) => {
e.stopPropagation();
}}
sx={{ display: 'inline-flex', pointerEvents: 'auto' }}
>
<OperationButton
operation={{
provider: 'video',
model: 'kling-v2.5-turbo-std-resume',
operation_type: 'scene_animation_resume',
actual_provider_name: 'wavespeed',
}}
label="Resume Animation"
variant="contained"
size="small"
startIcon={<ReplayIcon />}
showCost={false}
checkOnHover={false}
checkOnMount={false}
onClick={onResumeScene}
disabled={isAnimatingScene}
sx={{
minWidth: 'auto',
padding: '8px',
width: 40,
height: 40,
borderRadius: '50%',
background: 'linear-gradient(135deg, #b35c1e 0%, #f5a623 100%)',
boxShadow: '0 8px 16px rgba(179,92,30,0.35)',
color: 'white',
'&:hover': {
background: 'linear-gradient(135deg, #9c511a 0%, #e1911c 100%)',
},
'& .MuiButton-startIcon': {
margin: 0,
},
'& .MuiButton-label': {
display: 'none',
},
}}
tooltipPlacement="left"
/>
</Box>
</Tooltip>
)}
</Box>
</Box>
</>
@@ -325,6 +525,27 @@ const BookPages: React.FC<BookPagesProps> = ({
</Box>
</>
)}
{isAnimatingScene && (
<Box
sx={{
position: 'absolute',
inset: 0,
display: 'flex',
flexDirection: 'column',
alignItems: 'center',
justifyContent: 'center',
backdropFilter: 'blur(2px)',
backgroundColor: 'rgba(0,0,0,0.35)',
borderRadius: '12px',
color: '#fff',
gap: 1,
zIndex: 6,
}}
>
<CircularProgress color="inherit" size={36} />
<Typography variant="body2">Animating scene...</Typography>
</Box>
)}
</Box>
{/* Audio chip moved to right page */}
@@ -375,7 +596,10 @@ const BookPages: React.FC<BookPagesProps> = ({
'&:hover .chip-actions': { opacity: 1, pointerEvents: 'auto' },
}}
>
<OutlineHoverActions onEdit={onOpenEditModal} onImprove={onOpenEditModal} />
<OutlineHoverActions
onEdit={onOpenEditModal}
onImprove={onOpenEditModal}
/>
<Box sx={{ flex: 1, overflowY: 'auto', pt: { xs: 1, md: 2 } }}>
<Box className="chip-actions" sx={{ display: 'flex', gap: 1, flexWrap: 'wrap', mb: 1.5, opacity: 0, pointerEvents: 'none', transition: 'opacity 0.2s ease' }}>
<Chip

View File

@@ -1,5 +1,9 @@
import React from 'react';
import { Box, Button, Dialog, DialogActions, DialogContent, DialogTitle, TextField } from '@mui/material';
import { Box, Button, Dialog, DialogActions, DialogContent, DialogTitle, TextField, Divider, CircularProgress } from '@mui/material';
import { OperationButton } from '../../../shared/OperationButton';
import AutoFixHighIcon from '@mui/icons-material/AutoFixHigh';
import RefreshIcon from '@mui/icons-material/Refresh';
import { storyWriterApi } from '../../../../services/storyWriterApi';
interface ImageEditModalProps {
open: boolean;
@@ -8,9 +12,82 @@ interface ImageEditModalProps {
onChange: (v: string) => void;
onClose: () => void;
onSave: () => void;
onRegenerate?: (prompt: string) => Promise<void>;
imageProvider?: string | null;
imageWidth?: number;
imageHeight?: number;
imageModel?: string | null;
}
const ImageEditModal: React.FC<ImageEditModalProps> = ({ open, sceneNumber, value, onChange, onClose, onSave }) => {
const ImageEditModal: React.FC<ImageEditModalProps> = ({
open,
sceneNumber,
value,
onChange,
onClose,
onSave,
onRegenerate,
imageProvider,
imageWidth = 1024,
imageHeight = 1024,
imageModel,
}) => {
const [isRegenerating, setIsRegenerating] = React.useState(false);
const [regenerateError, setRegenerateError] = React.useState<string | null>(null);
const [isOptimizing, setIsOptimizing] = React.useState(false);
const [optimizeError, setOptimizeError] = React.useState<string | null>(null);
/**
 * Regenerate the scene image from the current prompt via `onRegenerate`.
 *
 * Does nothing when no `onRegenerate` callback was supplied or the prompt is
 * blank. `isRegenerating` is true while the request is in flight. Failures are
 * captured in `regenerateError` rather than re-thrown.
 */
const handleRegenerate = async () => {
  const prompt = value.trim();
  if (!onRegenerate || !prompt) {
    return;
  }
  setIsRegenerating(true);
  setRegenerateError(null);
  // The UI shows `regenerateError || optimizeError`; drop any stale optimizer
  // error so it does not linger next to a fresh regeneration attempt.
  setOptimizeError(null);
  try {
    await onRegenerate(prompt);
    // The modal deliberately stays open after success; call onClose() here to change that.
  } catch (err: any) {
    // `detail` may be a structured object/array instead of a string. The error
    // is rendered directly as a React child, and rendering a plain object
    // throws, so only string values are accepted.
    const detail = err?.response?.data?.detail;
    const message =
      (typeof detail === 'string' && detail) ||
      (typeof detail?.message === 'string' && detail.message) ||
      (typeof err?.message === 'string' && err.message) ||
      'Failed to regenerate image';
    setRegenerateError(message);
  } finally {
    setIsRegenerating(false);
  }
};
/**
 * Ask the backend to rewrite the current prompt into an optimized image prompt.
 *
 * Does nothing when the prompt is blank. On success the optimized text is
 * pushed to the parent through `onChange`. On failure (including a response
 * without a usable `optimized_prompt`) the message is stored in
 * `optimizeError` and logged to the console.
 */
const handleOptimize = async () => {
  const prompt = value.trim();
  if (!prompt) {
    return;
  }
  setIsOptimizing(true);
  setOptimizeError(null);
  // The UI shows `regenerateError || optimizeError`; a stale regenerate error
  // would otherwise hide any error produced by this optimization attempt.
  setRegenerateError(null);
  try {
    const response = await storyWriterApi.optimizePrompt({
      text: prompt,
      mode: 'image', // Scene prompts are always optimized for image generation
      style: 'default', // Could be made configurable in the future
    });
    if (!response.success || !response.optimized_prompt) {
      throw new Error('Optimization returned no result');
    }
    onChange(response.optimized_prompt);
  } catch (err: any) {
    // `detail` may be a structured object/array instead of a string. The error
    // is rendered directly as a React child, and rendering a plain object
    // throws, so only string values are accepted.
    const detail = err?.response?.data?.detail;
    const errorMessage =
      (typeof detail === 'string' && detail) ||
      (typeof detail?.message === 'string' && detail.message) ||
      (typeof err?.message === 'string' && err.message) ||
      'Failed to optimize prompt';
    setOptimizeError(errorMessage);
    console.error('Failed to optimize prompt:', err);
  } finally {
    setIsOptimizing(false);
  }
};
// Determine the model for cost estimation
// Default to FLUX.1-Krea-dev for HuggingFace, or stability model
const modelForEstimation = imageModel || (imageProvider === 'stability' ? 'stable-diffusion' : 'black-forest-labs/FLUX.1-Krea-dev');
const providerForEstimation = imageProvider || 'huggingface';
return (
<Dialog
open={open}
@@ -44,7 +121,54 @@ const ImageEditModal: React.FC<ImageEditModalProps> = ({ open, sceneNumber, valu
multiline
minRows={5}
fullWidth
placeholder="Enter a detailed description of the scene image..."
/>
{(regenerateError || optimizeError) && (
<Box sx={{ color: 'error.main', fontSize: '0.875rem', mt: -1 }}>
{regenerateError || optimizeError}
</Box>
)}
<Divider sx={{ my: 1 }} />
<Box sx={{ display: 'flex', gap: 2, flexWrap: 'wrap' }}>
{/* AI Prompt Optimizer */}
<Button
variant="outlined"
size="medium"
startIcon={isOptimizing ? <CircularProgress size={16} /> : <AutoFixHighIcon />}
onClick={handleOptimize}
disabled={isOptimizing || !value.trim() || isRegenerating}
sx={{ flex: 1, minWidth: '200px' }}
>
{isOptimizing ? 'Optimizing...' : 'AI Prompt Optimizer'}
</Button>
{/* Regenerate Scene - Active with cost estimation */}
{onRegenerate && (
<OperationButton
operation={{
provider: 'stability',
model: modelForEstimation,
tokens_requested: 0,
operation_type: 'image_generation',
actual_provider_name: providerForEstimation,
}}
label="Regenerate Scene"
variant="contained"
size="medium"
startIcon={<RefreshIcon />}
showCost={true}
checkOnHover={true}
checkOnMount={false}
onClick={handleRegenerate}
disabled={isRegenerating || !value.trim()}
loading={isRegenerating}
sx={{ flex: 1, minWidth: '200px' }}
/>
)}
</Box>
</Box>
</DialogContent>
<DialogActions>

View File

@@ -8,7 +8,10 @@ interface OutlineHoverActionsProps {
onImprove: () => void;
}
const OutlineHoverActions: React.FC<OutlineHoverActionsProps> = ({ onEdit, onImprove }) => {
const OutlineHoverActions: React.FC<OutlineHoverActionsProps> = ({
onEdit,
onImprove,
}) => {
return (
<Box
className="outline-actions"

View File

@@ -13,6 +13,7 @@ import { useStoryWriterState } from '../../../hooks/useStoryWriterState';
import { storyWriterApi } from '../../../services/storyWriterApi';
import { triggerSubscriptionError } from '../../../api/client';
import { aiApiClient } from '../../../api/client';
import { fetchMediaBlobUrl } from '../../../utils/fetchMediaBlobUrl';
import { MultimediaSection } from '../components/MultimediaSection';
const MotionBox = motion(Box);
@@ -123,10 +124,13 @@ const StoryWriting: React.FC<StoryWritingProps> = ({ state, onNext }) => {
const [pageDirection, setPageDirection] = useState(0);
const [imageLoadError, setImageLoadError] = useState<Set<number>>(new Set());
const [imageBlobUrls, setImageBlobUrls] = useState<Map<number, string>>(new Map());
const [videoBlobUrls, setVideoBlobUrls] = useState<Map<number, string>>(new Map());
const [videoLoadError, setVideoLoadError] = useState<Set<number>>(new Set());
// Get scenes and images from state
const scenes = state.outlineScenes || [];
const sceneImages = state.sceneImages || new Map<number, string>();
const sceneAnimatedVideos = state.sceneAnimatedVideos || new Map<number, string>();
const hasScenes = state.isOutlineStructured && scenes.length > 0;
// Split story content into sections mapped to scenes
@@ -201,6 +205,10 @@ const StoryWriting: React.FC<StoryWritingProps> = ({ state, onNext }) => {
}, []);
const currentSceneImageFullUrl = imageBlobUrls.get(currentSceneNumber) || null;
const currentSceneAnimatedVideoUrl = sceneAnimatedVideos.get(currentSceneNumber) || null;
const currentSceneAnimatedVideoBlobUrl = videoBlobUrls.get(currentSceneNumber) || null;
const hasVideoLoadError = videoLoadError.has(currentSceneNumber);
const showAnimatedVideo = Boolean(currentSceneAnimatedVideoBlobUrl);
// Reset image load error when page changes
useEffect(() => {
@@ -211,6 +219,60 @@ const StoryWriting: React.FC<StoryWritingProps> = ({ state, onNext }) => {
});
}, [currentSceneNumber]);
// Load the current scene's animated video as a blob URL.
// Skipped when the scene has no animated video, when a previous attempt for
// this scene failed, or when a blob URL for it is already stored.
useEffect(() => {
  if (!currentSceneAnimatedVideoUrl || hasVideoLoadError || currentSceneAnimatedVideoBlobUrl) {
    return;
  }

  // Set by the cleanup below so a late fetch result is not stored after the
  // effect has been torn down (scene change or unmount).
  let cancelled = false;

  const markLoadFailed = () => {
    setVideoLoadError((prev) => new Set(prev).add(currentSceneNumber));
  };

  const loadVideo = async () => {
    try {
      const videoPath = currentSceneAnimatedVideoUrl.startsWith('/')
        ? currentSceneAnimatedVideoUrl
        : `/${currentSceneAnimatedVideoUrl}`;
      const blobUrl = await fetchMediaBlobUrl(videoPath);

      if (!blobUrl) {
        markLoadFailed();
        return;
      }
      if (cancelled) {
        // Nobody will store this URL, so release it now instead of leaking
        // the underlying blob for the lifetime of the page.
        URL.revokeObjectURL(blobUrl);
        return;
      }

      setVideoBlobUrls((prev) => {
        const next = new Map(prev);
        const existing = next.get(currentSceneNumber);
        if (existing) {
          // Replace-and-release: the old URL for this scene is no longer reachable.
          URL.revokeObjectURL(existing);
        }
        next.set(currentSceneNumber, blobUrl);
        return next;
      });
    } catch (err) {
      console.warn('Failed to load animated video:', err);
      markLoadFailed();
    }
  };

  loadVideo();

  return () => {
    cancelled = true;
  };
}, [currentSceneNumber, currentSceneAnimatedVideoUrl, currentSceneAnimatedVideoBlobUrl, hasVideoLoadError]);
// Release every video blob URL when the component unmounts.
//
// The cleanup must NOT depend on `videoBlobUrls`: an effect cleanup runs
// before each re-run, so with that dependency every map update would revoke
// the URLs of the previous map, which are still present in the new map, and
// already-loaded scene videos would stop playing. A ref gives the unmount
// cleanup access to the latest map without re-running the effect.
const videoBlobUrlsRef = React.useRef(videoBlobUrls);
videoBlobUrlsRef.current = videoBlobUrls;
useEffect(() => {
  return () => {
    videoBlobUrlsRef.current.forEach((blob) => {
      URL.revokeObjectURL(blob);
    });
  };
}, []);
useEffect(() => {
if (storySections.length > 0) {
setCurrentPageIndex(0);
@@ -502,7 +564,37 @@ const StoryWriting: React.FC<StoryWritingProps> = ({ state, onNext }) => {
},
}}
>
{currentSceneImageFullUrl ? (
{showAnimatedVideo ? (
<Box
sx={{
width: '100%',
borderRadius: '12px',
overflow: 'hidden',
boxShadow: '0 8px 20px rgba(0, 0, 0, 0.18), 0 4px 8px rgba(0, 0, 0, 0.12)',
border: '3px solid rgba(120, 90, 60, 0.25)',
backgroundColor: '#000',
}}
>
<Box
component="video"
src={currentSceneAnimatedVideoBlobUrl ?? undefined}
poster={currentSceneImageFullUrl ?? undefined}
autoPlay
muted
loop
controls
playsInline
sx={{
width: '100%',
height: 'auto',
display: 'block',
minHeight: '300px',
maxHeight: '500px',
objectFit: 'cover',
}}
/>
</Box>
) : currentSceneImageFullUrl ? (
<Box
sx={{
width: '100%',

View File

@@ -123,23 +123,38 @@ export const StoryWriter: React.FC = () => {
setIsGeneratingVideo(true);
try {
const imageUrls: string[] = [];
const imageUrls: (string | null)[] = [];
const audioUrls: string[] = [];
const scenes = state.outlineScenes;
const videoUrls: (string | null)[] = [];
for (const scene of scenes) {
const sceneNumber = scene.scene_number || scenes.indexOf(scene) + 1;
const imageUrl = state.sceneImages?.get(sceneNumber);
const audioUrl = state.sceneAudio?.get(sceneNumber);
const animatedVideoUrl = state.sceneAnimatedVideos?.get(sceneNumber);
if (imageUrl && audioUrl) {
imageUrls.push(imageUrl);
audioUrls.push(audioUrl);
if (!audioUrl) {
continue; // Skip scenes without audio
}
// Prefer animated video if available, otherwise use image
if (animatedVideoUrl) {
videoUrls.push(animatedVideoUrl);
imageUrls.push(null);
} else if (imageUrl) {
videoUrls.push(null);
imageUrls.push(imageUrl);
} else {
continue; // Skip scenes without image or video
}
audioUrls.push(audioUrl);
}
if (imageUrls.length !== scenes.length || audioUrls.length !== scenes.length) {
throw new Error('Number of images and audio files must match number of scenes');
throw new Error('Number of images/videos and audio files must match number of scenes');
}
// Switch to async flow so UI can poll progress messages
@@ -147,6 +162,8 @@ export const StoryWriter: React.FC = () => {
scenes: scenes,
image_urls: imageUrls,
audio_urls: audioUrls,
video_urls: videoUrls.length > 0 ? videoUrls : undefined,
ai_audio_urls: undefined, // TODO: Track AI audio separately in state
story_title: state.storySetting || 'Story',
fps: state.videoFps,
transition_duration: state.videoTransitionDuration,

View File

@@ -29,14 +29,30 @@ export const AudioPlayerList: React.FC<AudioPlayerListProps> = ({ scenes, sceneA
for (const [sceneNumber, audioPath] of entries) {
if (!audioPath) continue;
try {
const normalizedPath = audioPath.startsWith('/') ? audioPath : `/${audioPath}`;
// Normalize path - ensure it starts with /api/story/audio/
let normalizedPath = audioPath.startsWith('/') ? audioPath : `/${audioPath}`;
// If path doesn't include /api/story/audio/, add it
if (!normalizedPath.includes('/api/story/audio/')) {
// Extract filename from path
const filename = audioPath.split('/').pop() || audioPath;
normalizedPath = `/api/story/audio/${filename}`;
}
const response = await aiApiClient.get(normalizedPath, {
responseType: 'blob',
});
const blobUrl = URL.createObjectURL(response.data);
blobEntries.push([sceneNumber, blobUrl]);
} catch (err) {
console.error('Failed to load audio blob:', err);
} catch (err: any) {
console.error(`Failed to load audio blob for scene ${sceneNumber}:`, err);
console.error(`Audio path was: ${audioPath}`);
console.error(`Normalized path would be: ${audioPath.startsWith('/') ? audioPath : `/${audioPath}`}`);
// If auth error, log more details
if (err?.response?.status === 401) {
console.error(`Authentication failed for audio file. Make sure auth token is set.`);
}
}
}
@@ -87,13 +103,19 @@ export const AudioPlayerList: React.FC<AudioPlayerListProps> = ({ scenes, sceneA
<Typography variant="subtitle2" sx={{ mb: 1, fontWeight: 600, color: '#1A1611' }}>
Scene {sceneNumber}: {scene.title || `Scene ${sceneNumber}`}
</Typography>
<audio
controls
src={blobUrl ? blobUrl : storyWriterApi.getAudioUrl(audioUrl)}
style={{ width: '100%' }}
>
Your browser does not support the audio element.
</audio>
{blobUrl ? (
<audio
controls
src={blobUrl}
style={{ width: '100%' }}
>
Your browser does not support the audio element.
</audio>
) : (
<Typography variant="body2" sx={{ color: 'text.secondary', fontStyle: 'italic' }}>
Loading audio...
</Typography>
)}
</Box>
);
})}

View File

@@ -2,15 +2,14 @@ import React, { useState, useRef } from 'react';
import {
Box,
Typography,
Button,
Alert,
LinearProgress,
Tooltip,
} from '@mui/material';
import SmartDisplayIcon from '@mui/icons-material/SmartDisplay';
import { useStoryWriterState } from '../../../hooks/useStoryWriterState';
import { storyWriterApi } from '../../../services/storyWriterApi';
import { triggerSubscriptionError } from '../../../api/client';
import { OperationButton } from '../../shared/OperationButton';
import SceneVideoApproval from './SceneVideoApproval';
// Simple logger for frontend
@@ -94,14 +93,11 @@ export const HdVideoSection: React.FC<HdVideoSectionProps> = ({ state, onError }
setHdVideoMessage(`Generating HD video for Scene ${sceneNumber}...`);
try {
const sceneImageUrl = state.sceneImages?.get(sceneNumber);
const result = await storyWriterApi.generateHdVideoScene({
scene_number: sceneNumber,
scene_data: scene,
story_context: storyContext,
all_scenes: scenes,
scene_image_url: sceneImageUrl,
provider: 'huggingface',
model: 'tencent/HunyuanVideo',
num_frames: 50,
@@ -240,14 +236,11 @@ export const HdVideoSection: React.FC<HdVideoSectionProps> = ({ state, onError }
story_content: state.storyContent || '',
};
const sceneImageUrl = state.sceneImages?.get(sceneNumber);
const result = await storyWriterApi.generateHdVideoScene({
scene_number: sceneNumber,
scene_data: scene,
story_context: storyContext,
all_scenes: scenes,
scene_image_url: sceneImageUrl,
provider: 'huggingface',
model: 'tencent/HunyuanVideo',
num_frames: 50,
@@ -303,45 +296,30 @@ export const HdVideoSection: React.FC<HdVideoSectionProps> = ({ state, onError }
return (
<>
<Box sx={{ mt: 1, display: 'flex', flexDirection: 'column', gap: 1 }}>
<Tooltip
title={
<Box sx={{ p: 1 }}>
<Typography variant="body2" sx={{ mb: 1, fontWeight: 600 }}>
Generate HD Animation with AI
</Typography>
<Typography variant="caption" sx={{ display: 'block', mb: 1 }}>
Upgrade this storyboard into a highdefinition AI animation using Hugging Face texttovideo models.
Your draft was generated affordably (images + narration). This premium option uses an AI model to render motion.
</Typography>
<Typography variant="caption" sx={{ display: 'block', mb: 0.5, fontWeight: 600 }}>
Recommended models:
</Typography>
<Typography variant="caption" component="div" sx={{ display: 'block', mb: 1 }}>
tencent/HunyuanVideo<br />
Lightricks/LTX-Video<br />
Lightricks/LTX-Video-0.9.8-13B-distilled
</Typography>
<Typography variant="caption" sx={{ display: 'block', fontStyle: 'italic' }}>
This will generate HD videos for each scene one at a time. You'll review and approve each scene before the next one is generated.
</Typography>
</Box>
}
arrow
placement="top"
>
<span style={{ display: 'inline-flex' }}>
<Button
variant="contained"
startIcon={<SmartDisplayIcon />}
onClick={handleGenerateHdVideo}
disabled={isGeneratingHdVideo || state.hdVideoGenerationStatus === 'awaiting_approval'}
>
{isGeneratingHdVideo || state.hdVideoGenerationStatus === 'awaiting_approval'
? 'Generating HD Animation...'
: 'Generate HD Animation with AI'}
</Button>
</span>
</Tooltip>
<OperationButton
operation={{
provider: 'video',
model: 'tencent/HunyuanVideo',
tokens_requested: 0,
operation_type: 'video_generation',
actual_provider_name: 'huggingface',
}}
label="Generate HD Animation with AI"
variant="contained"
startIcon={<SmartDisplayIcon />}
showCost={true}
checkOnHover={true}
checkOnMount={false}
onClick={handleGenerateHdVideo}
disabled={isGeneratingHdVideo || state.hdVideoGenerationStatus === 'awaiting_approval'}
loading={isGeneratingHdVideo || state.hdVideoGenerationStatus === 'awaiting_approval'}
tooltipPlacement="top"
buttonProps={{
children: isGeneratingHdVideo || state.hdVideoGenerationStatus === 'awaiting_approval'
? 'Generating HD Animation...'
: undefined,
}}
/>
{(isGeneratingHdVideo || state.hdVideoGenerationStatus === 'generating' || state.hdVideoGenerationStatus === 'awaiting_approval') && (
<Box sx={{ mt: 2, p: 2, backgroundColor: '#FAF9F6', borderRadius: 1, border: '1px solid #E0DCD4' }}>

View File

@@ -40,7 +40,19 @@ export const VideoSection: React.FC<VideoSectionProps> = ({ state, error, onErro
// Load video blob URL when storyVideo changes
useEffect(() => {
if (state.storyVideo) {
fetchMediaBlobUrl(state.storyVideo).then(setVideoBlobUrl);
fetchMediaBlobUrl(state.storyVideo)
.then((blobUrl) => {
if (blobUrl) {
setVideoBlobUrl(blobUrl);
} else {
// File not found - clear the blob URL
setVideoBlobUrl(null);
}
})
.catch((err) => {
console.warn('Failed to load video blob:', err);
setVideoBlobUrl(null);
});
} else {
if (videoBlobUrl) {
URL.revokeObjectURL(videoBlobUrl);
@@ -76,31 +88,50 @@ export const VideoSection: React.FC<VideoSectionProps> = ({ state, error, onErro
setVideoMessage('');
try {
const imageUrls: string[] = [];
const imageUrls: (string | null)[] = [];
const audioUrls: string[] = [];
const scenes = state.outlineScenes;
const videoUrls: (string | null)[] = [];
const aiAudioUrls: (string | null)[] = [];
for (const scene of scenes) {
const sceneNumber = scene.scene_number || scenes.indexOf(scene) + 1;
const imageUrl = state.sceneImages?.get(sceneNumber);
const audioUrl = state.sceneAudio?.get(sceneNumber);
const animatedVideoUrl = state.sceneAnimatedVideos?.get(sceneNumber);
if (imageUrl && audioUrl) {
imageUrls.push(imageUrl);
audioUrls.push(audioUrl);
} else {
throw new Error(`Missing image or audio for scene ${sceneNumber}`);
if (!audioUrl) {
throw new Error(`Missing audio for scene ${sceneNumber}`);
}
// Prefer animated video if available, otherwise use image
if (animatedVideoUrl) {
videoUrls.push(animatedVideoUrl);
imageUrls.push(null);
} else if (imageUrl) {
videoUrls.push(null);
imageUrls.push(imageUrl);
} else {
throw new Error(`Missing image or animated video for scene ${sceneNumber}`);
}
audioUrls.push(audioUrl);
// AI audio detection: check if URL contains 'ai' or 'wavespeed' (can be enhanced later)
// For now, pass null and backend will use available audio
aiAudioUrls.push(null);
}
if (imageUrls.length !== scenes.length || audioUrls.length !== scenes.length) {
throw new Error('Number of images and audio files must match number of scenes');
throw new Error('Number of images/videos and audio files must match number of scenes');
}
const start = await storyWriterApi.generateStoryVideoAsync({
scenes: scenes,
image_urls: imageUrls,
audio_urls: audioUrls,
video_urls: videoUrls.length > 0 ? videoUrls : undefined,
ai_audio_urls: undefined, // TODO: Track AI audio separately in state
story_title: state.storySetting || 'Story',
fps: state.videoFps,
transition_duration: state.videoTransitionDuration,
@@ -122,7 +153,9 @@ export const VideoSection: React.FC<VideoSectionProps> = ({ state, error, onErro
if (!finalUrl) throw new Error('Video URL not found in result');
state.setStoryVideo(finalUrl);
const blobUrl = await fetchMediaBlobUrl(finalUrl);
setVideoBlobUrl(blobUrl);
if (blobUrl) {
setVideoBlobUrl(blobUrl);
}
setVideoProgress(100);
setVideoMessage('Video generation complete');
state.setError(null);
@@ -160,6 +193,10 @@ export const VideoSection: React.FC<VideoSectionProps> = ({ state, error, onErro
const handleDownloadVideo = async () => {
if (state.storyVideo) {
const blobUrl = await fetchMediaBlobUrl(state.storyVideo);
if (!blobUrl) {
// File not found - skip download
return;
}
const a = document.createElement('a');
a.href = blobUrl;
a.download = `story-video-${Date.now()}.mp4`;

View File

@@ -0,0 +1,273 @@
import React, { useMemo } from 'react';
import {
Button,
ButtonProps,
Tooltip,
Box,
Typography,
CircularProgress,
} from '@mui/material';
import WarningIcon from '@mui/icons-material/Warning';
import { SxProps, Theme } from '@mui/material/styles';
import { usePreflightCheck, UsePreflightCheckOptions } from '../../hooks/usePreflightCheck';
import { PreflightOperation } from '../../services/billingService';
/**
 * Props accepted by `OperationButton`.
 */
export interface OperationButtonProps {
// Operation definition: forwarded unchanged to `usePreflightCheck`
operation: PreflightOperation;
// Button configuration
label: string; // Base label (e.g., "Generate HD Video"); the formatted cost is appended when shown
variant?: 'contained' | 'outlined' | 'text';
size?: 'small' | 'medium' | 'large';
color?: 'primary' | 'secondary' | 'success' | 'error';
startIcon?: React.ReactNode;
endIcon?: React.ReactNode;
// Pre-flight check behavior
showCost?: boolean; // Show cost in label (default: true); a cost of 0 is never shown
checkOnHover?: boolean; // Run a debounced check on hover / tooltip open (default: true)
checkOnMount?: boolean; // Run an immediate check when the component mounts (default: false)
// Callbacks
onClick: () => void; // Only invoked while the button is not disabled
onPreflightResult?: (canProceed: boolean) => void; // Called whenever the check's `canProceed` value changes
// Customization
disabled?: boolean; // Additional disabled state (combined with the pre-flight result)
loading?: boolean; // Loading state override: shows a spinner and disables the button
tooltipPlacement?: 'top' | 'bottom' | 'left' | 'right';
// Styling
sx?: SxProps<Theme>;
fullWidth?: boolean;
// Additional button props: spread onto the underlying MUI Button after the props above.
// `children` is used as the label while `loading` is true.
buttonProps?: Partial<ButtonProps>;
}
/**
 * Reusable button that runs a billing pre-flight check before an operation.
 *
 * Behavior:
 * - Appends the estimated cost (formatted as USD) to the label when
 *   `showCost` is set and the estimated cost is non-zero.
 * - Triggers a debounced pre-flight check on hover / tooltip open, and an
 *   immediate check on mount when `checkOnMount` is set.
 * - The tooltip shows usage vs. limit and the estimated cost when limit
 *   information is available, followed (once) by the reason the operation
 *   is blocked or the check failed.
 * - The button is disabled while a check is running, while `loading` is
 *   set, when `disabled` is set, or when the check reports the operation as
 *   not allowed.
 */
export const OperationButton: React.FC<OperationButtonProps> = ({
  operation,
  label,
  variant = 'contained',
  size = 'medium',
  color = 'primary',
  startIcon,
  endIcon,
  showCost = true,
  checkOnHover = true,
  checkOnMount = false,
  onClick,
  onPreflightResult,
  disabled: externalDisabled = false,
  loading: externalLoading = false,
  tooltipPlacement = 'top',
  sx,
  fullWidth = false,
  buttonProps = {},
}) => {
  const preflightOptions: UsePreflightCheckOptions = {
    operation,
    enabled: checkOnHover || checkOnMount,
    debounceMs: 300,
    cacheTtl: 5000,
  };

  const {
    canProceed,
    estimatedCost,
    limitInfo,
    loading: preflightLoading,
    error: preflightError,
    checkOnHover: triggerCheckOnHover,
    checkNow: triggerCheckNow,
  } = usePreflightCheck(preflightOptions);

  // Check on mount if requested
  React.useEffect(() => {
    if (checkOnMount) {
      triggerCheckNow();
    }
  }, [checkOnMount, triggerCheckNow]);

  // Notify parent of pre-flight result changes
  React.useEffect(() => {
    if (onPreflightResult) {
      onPreflightResult(canProceed);
    }
  }, [canProceed, onPreflightResult]);

  // Format cost as currency; null means "do not show a cost"
  const formattedCost = useMemo(() => {
    if (!showCost || estimatedCost === 0) {
      return null;
    }
    return new Intl.NumberFormat('en-US', {
      style: 'currency',
      currency: 'USD',
      minimumFractionDigits: 2,
      maximumFractionDigits: 2,
    }).format(estimatedCost);
  }, [estimatedCost, showCost]);

  // Build button label with cost
  const buttonLabel = useMemo(() => {
    if (formattedCost) {
      return `${label} ${formattedCost}`;
    }
    return label;
  }, [label, formattedCost]);

  // The button is unusable while anything is in progress or the check blocks it
  const isDisabled = externalDisabled || externalLoading || preflightLoading || !canProceed;

  // Build tooltip content
  const tooltipContent = useMemo(() => {
    const content: React.ReactNode[] = [];

    if (preflightLoading) {
      content.push(
        <Typography key="loading" variant="body2" sx={{ mb: 1 }}>
          Checking limits...
        </Typography>
      );
    } else {
      // Show usage details whenever we have them, including for blocked
      // operations, so the user can see which limit was hit.
      if (limitInfo) {
        const { current_usage, limit, remaining } = limitInfo;
        const isUnlimited = limit === 0 || remaining === Infinity;
        content.push(
          <Box key="limits" sx={{ mb: 1 }}>
            <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
              {canProceed ? '✅ Operation Allowed' : '❌ Operation Blocked'}
            </Typography>
            {isUnlimited ? (
              <Typography variant="caption" sx={{ display: 'block' }}>
                Usage: {current_usage} / Unlimited
              </Typography>
            ) : (
              <Typography variant="caption" sx={{ display: 'block' }}>
                Usage: {current_usage} / {limit} ({remaining} remaining)
              </Typography>
            )}
            {formattedCost && (
              <Typography variant="caption" sx={{ display: 'block', mt: 0.5, fontWeight: 600 }}>
                Estimated Cost: {formattedCost}
              </Typography>
            )}
          </Box>
        );
      }

      // Render the failure / block reason exactly once.
      if (preflightError) {
        content.push(
          <Typography key="error" variant="body2" sx={{ mb: 1, color: 'error.main', fontWeight: 600 }}>
            {preflightError}
          </Typography>
        );
      }
    }

    return content.length > 0 ? <Box sx={{ p: 0.5 }}>{content}</Box> : null;
  }, [canProceed, formattedCost, limitInfo, preflightError, preflightLoading]);

  // Handle hover
  const handleMouseEnter = () => {
    if (checkOnHover) {
      triggerCheckOnHover();
    }
  };

  // Handle click (isDisabled already covers a blocked pre-flight result)
  const handleClick = () => {
    if (!isDisabled) {
      onClick();
    }
  };

  // A blocked operation is always rendered in the error color
  const buttonColor = canProceed ? color : 'error';

  // Spinner is shown for caller-driven loading, and for checks only when
  // they were requested on mount (hover checks report through the tooltip).
  const showLoading = externalLoading || (preflightLoading && checkOnMount);

  // Custom label override for loading state
  const displayLabel = useMemo(() => {
    if (externalLoading && buttonProps?.children) {
      return buttonProps.children;
    }
    if (showLoading && !externalLoading) {
      return 'Checking...';
    }
    if (!canProceed && preflightError) {
      return preflightError;
    }
    return buttonLabel;
  }, [externalLoading, showLoading, canProceed, preflightError, buttonLabel, buttonProps?.children]);

  // Build button with icon
  const button = (
    <Button
      variant={variant}
      size={size}
      color={buttonColor}
      startIcon={
        showLoading ? (
          <CircularProgress size={16} color="inherit" />
        ) : !canProceed ? (
          <WarningIcon fontSize="small" />
        ) : (
          startIcon
        )
      }
      endIcon={endIcon}
      onClick={handleClick}
      disabled={isDisabled}
      fullWidth={fullWidth}
      onMouseEnter={handleMouseEnter}
      sx={sx}
      {...buttonProps}
    >
      {displayLabel}
    </Button>
  );

  // Wrap with tooltip if we have content. The span wrapper keeps the tooltip
  // working while the button itself is disabled.
  if (tooltipContent || checkOnHover) {
    return (
      <Tooltip
        title={tooltipContent || 'Hover to check limits'}
        arrow
        placement={tooltipPlacement}
        onOpen={handleMouseEnter}
      >
        <span style={{ display: 'inline-flex' }}>
          {button}
        </span>
      </Tooltip>
    );
  }

  return button;
};

View File

@@ -0,0 +1,257 @@
import { useState, useCallback, useRef, useEffect } from 'react';
import {
checkPreflight,
PreflightOperation,
PreflightCheckResponse,
PreflightLimitInfo,
} from '../services/billingService';
/**
 * Options accepted by `usePreflightCheck`.
 */
export interface UsePreflightCheckOptions {
operation: PreflightOperation; // Operation to validate; its JSON serialization is used as the cache key
enabled?: boolean; // When false, checkOnHover / checkNow do nothing (default: true)
debounceMs?: number; // Debounce delay (default: 300ms)
cacheTtl?: number; // Cache TTL in ms (default: 5000ms)
}
/**
 * State and actions returned by `usePreflightCheck`.
 */
export interface UsePreflightCheckResult {
canProceed: boolean; // true until a check reports otherwise or fails
estimatedCost: number; // 0 until a check has returned a cost
limitInfo: PreflightLimitInfo | null; // Limit info of the first operation in the response, if any
loading: boolean; // true while a check request is pending
error: string | null; // Block reason or failure message, if any
checkOnHover: () => void; // Debounced check (cache hits are applied immediately)
checkNow: () => void; // Immediate check
reset: () => void; // Restore the initial state and drop pending work
}
/**
 * One cached pre-flight response together with the time it was stored.
 */
interface CacheEntry {
data: PreflightCheckResponse;
timestamp: number; // Date.now() at the moment the entry was stored
}
/**
 * React hook for pre-flight checking operations with cost estimation.
 *
 * Features:
 * - Debounced hover checks (300ms default)
 * - In-memory caching per hook instance (5s default TTL), keyed by the
 *   JSON serialization of the operation
 * - Results of superseded or post-unmount requests are ignored. The HTTP
 *   request itself is not cancelled, because `checkPreflight` takes no
 *   abort signal.
 *
 * The returned callbacks keep a stable identity for as long as the
 * operation is structurally unchanged, so callers may pass a fresh object
 * literal on every render and still list `checkNow` / `checkOnHover` in
 * effect dependency arrays without re-triggering the effect each render.
 */
export const usePreflightCheck = (
  options: UsePreflightCheckOptions
): UsePreflightCheckResult => {
  const {
    operation,
    enabled = true,
    debounceMs = 300,
    cacheTtl = 5000,
  } = options;

  const [canProceed, setCanProceed] = useState<boolean>(true);
  const [estimatedCost, setEstimatedCost] = useState<number>(0);
  const [limitInfo, setLimitInfo] = useState<PreflightLimitInfo | null>(null);
  const [loading, setLoading] = useState<boolean>(false);
  const [error, setError] = useState<string | null>(null);

  // Cache for pre-flight check results
  const cacheRef = useRef<Map<string, CacheEntry>>(new Map());
  // Debounce timer ref
  const debounceTimerRef = useRef<NodeJS.Timeout | null>(null);
  // Abort controller used only as a "this request was superseded" flag
  const abortControllerRef = useRef<AbortController | null>(null);

  // Serialized operation: serves as cache key and as the value that decides
  // whether the operation "changed" between renders.
  const operationKey = JSON.stringify(operation);

  // Keep the first operation object seen for a given key, so that callbacks
  // below do not change identity when the caller re-creates an equal object.
  const stableOperationRef = useRef<{ key: string; operation: PreflightOperation }>({
    key: operationKey,
    operation,
  });
  if (stableOperationRef.current.key !== operationKey) {
    stableOperationRef.current = { key: operationKey, operation };
  }
  const stableOperation = stableOperationRef.current.operation;

  // Check if cached result is still valid
  const getCachedResult = useCallback((): PreflightCheckResponse | null => {
    const cached = cacheRef.current.get(operationKey);
    if (cached) {
      const age = Date.now() - cached.timestamp;
      if (age < cacheTtl) {
        return cached.data;
      }
      // Cache expired, remove it
      cacheRef.current.delete(operationKey);
    }
    return null;
  }, [operationKey, cacheTtl]);

  // Store result in cache
  const setCache = useCallback((data: PreflightCheckResponse) => {
    cacheRef.current.set(operationKey, {
      data,
      timestamp: Date.now(),
    });
  }, [operationKey]);

  // Update state from response (declared before the callbacks that use it)
  const updateState = useCallback((response: PreflightCheckResponse) => {
    setCanProceed(response.can_proceed);
    setEstimatedCost(response.estimated_cost);

    // Get limit info from first operation (for single operation checks)
    const firstOp = response.operations[0];
    if (firstOp) {
      setLimitInfo(firstOp.limit_info);
      if (!response.can_proceed && firstOp.message) {
        setError(firstOp.message);
      } else {
        setError(null);
      }
    } else {
      setLimitInfo(null);
      // No per-operation message available: do not keep showing a stale one
      setError(null);
    }
  }, []);

  // Perform actual pre-flight check
  const performCheck = useCallback(async (): Promise<void> => {
    if (!enabled) {
      return;
    }

    // Check cache first
    const cached = getCachedResult();
    if (cached) {
      updateState(cached);
      return;
    }

    // Mark any in-flight request as superseded
    if (abortControllerRef.current) {
      abortControllerRef.current.abort();
    }

    // Create new abort controller
    abortControllerRef.current = new AbortController();
    const currentAbortController = abortControllerRef.current;

    setLoading(true);
    setError(null);

    try {
      const response = await checkPreflight(stableOperation);

      // Ignore the result if a newer request, reset() or unmount superseded it
      if (currentAbortController.signal.aborted) {
        return;
      }

      // Cache the result
      setCache(response);

      // Update state
      updateState(response);
    } catch (err: any) {
      // Ignore failures of superseded requests as well
      if (currentAbortController.signal.aborted) {
        return;
      }

      const errorMessage = err?.message || 'Pre-flight check failed';
      setError(errorMessage);
      setCanProceed(false);
      setEstimatedCost(0);
      setLimitInfo(null);
    } finally {
      if (!currentAbortController.signal.aborted) {
        setLoading(false);
      }
    }
  }, [stableOperation, enabled, getCachedResult, setCache, updateState]);

  // Debounced check for hover events
  const checkOnHover = useCallback(() => {
    if (!enabled) {
      return;
    }

    // Clear existing timer
    if (debounceTimerRef.current) {
      clearTimeout(debounceTimerRef.current);
    }

    // Check cache first (no debounce for cache hits)
    const cached = getCachedResult();
    if (cached) {
      updateState(cached);
      return;
    }

    // Debounce the actual API call
    debounceTimerRef.current = setTimeout(() => {
      performCheck();
    }, debounceMs);
  }, [enabled, debounceMs, getCachedResult, updateState, performCheck]);

  // Immediate check (no debounce)
  const checkNow = useCallback(() => {
    if (!enabled) {
      return;
    }

    // Clear any pending debounced check
    if (debounceTimerRef.current) {
      clearTimeout(debounceTimerRef.current);
      debounceTimerRef.current = null;
    }

    performCheck();
  }, [enabled, performCheck]);

  // Reset state
  const reset = useCallback(() => {
    setCanProceed(true);
    setEstimatedCost(0);
    setLimitInfo(null);
    setLoading(false);
    setError(null);

    // Clear debounce timer
    if (debounceTimerRef.current) {
      clearTimeout(debounceTimerRef.current);
      debounceTimerRef.current = null;
    }

    // Mark any in-flight request as superseded
    if (abortControllerRef.current) {
      abortControllerRef.current.abort();
      abortControllerRef.current = null;
    }
  }, []);

  // Cleanup on unmount
  useEffect(() => {
    return () => {
      // Clear debounce timer
      if (debounceTimerRef.current) {
        clearTimeout(debounceTimerRef.current);
      }

      // Make sure a late response does not touch state after unmount
      if (abortControllerRef.current) {
        abortControllerRef.current.abort();
      }
    };
  }, []);

  return {
    canProceed,
    estimatedCost,
    limitInfo,
    loading,
    error,
    checkOnHover,
    checkNow,
    reset,
  };
};

View File

@@ -7,6 +7,13 @@ import {
StoryFullGenerationResponse,
} from '../services/storyWriterApi';
/**
 * Pending resume info for one scene's animation.
 * Stored per scene number in `StoryWriterState.sceneAnimationResumables`.
 */
export interface SceneAnimationResume {
predictionId: string; // presumably the id sent as `prediction_id` when resuming — TODO confirm against caller
duration: 5 | 10; // only these two values are allowed; unit is not shown here (presumably seconds)
message?: string;
createdAt?: string; // NOTE(review): format not visible here — confirm
}
export interface StoryWriterState {
// Story parameters (Setup phase)
persona: string;
@@ -52,6 +59,8 @@ export interface StoryWriterState {
sceneAudio: Map<number, string> | null; // Generated audio URLs by scene number
storyVideo: string | null; // Generated video URL
sceneHdVideos: Map<number, string> | null; // Approved HD video URLs by scene number
sceneAnimatedVideos: Map<number, string> | null; // Animated scene preview videos
sceneAnimationResumables: Map<number, SceneAnimationResume> | null; // Pending resume info per scene
hdVideoGenerationStatus: 'idle' | 'generating' | 'awaiting_approval' | 'completed' | 'paused';
currentHdSceneIndex: number; // Which scene is currently being generated/reviewed
@@ -104,6 +113,8 @@ const DEFAULT_STATE: Partial<StoryWriterState> = {
sceneAudio: null,
storyVideo: null,
sceneHdVideos: null,
sceneAnimatedVideos: null,
sceneAnimationResumables: null,
hdVideoGenerationStatus: 'idle',
currentHdSceneIndex: 0,
currentTaskId: null,
@@ -148,6 +159,8 @@ export const useStoryWriterState = () => {
sceneImages: parsed.sceneImages ? new Map(parsed.sceneImages) : null,
sceneAudio: parsed.sceneAudio ? new Map(parsed.sceneAudio) : null,
sceneHdVideos: parsed.sceneHdVideos ? new Map(parsed.sceneHdVideos) : null,
sceneAnimatedVideos: parsed.sceneAnimatedVideos ? new Map(parsed.sceneAnimatedVideos) : null,
sceneAnimationResumables: parsed.sceneAnimationResumables ? new Map(parsed.sceneAnimationResumables) : null,
};
return restoredState as StoryWriterState;
@@ -193,6 +206,12 @@ export const useStoryWriterState = () => {
sceneImages: persistableState.sceneImages ? Array.from(persistableState.sceneImages.entries()) : null,
sceneAudio: persistableState.sceneAudio ? Array.from(persistableState.sceneAudio.entries()) : null,
sceneHdVideos: persistableState.sceneHdVideos ? Array.from(persistableState.sceneHdVideos.entries()) : null,
sceneAnimatedVideos: persistableState.sceneAnimatedVideos
? Array.from(persistableState.sceneAnimatedVideos.entries())
: null,
sceneAnimationResumables: persistableState.sceneAnimationResumables
? Array.from(persistableState.sceneAnimationResumables.entries())
: null,
};
localStorage.setItem('story_writer_state', JSON.stringify(serializableState));
@@ -337,6 +356,14 @@ export const useStoryWriterState = () => {
setState((prev) => ({ ...prev, sceneImages: images }));
}, []);
// Replace the map of animated scene preview videos (or clear it with null).
const setSceneAnimatedVideos = useCallback(
  (videos: Map<number, string> | null) => {
    setState((current) => {
      return { ...current, sceneAnimatedVideos: videos };
    });
  },
  []
);
// Replace the map of pending per-scene animation resume info (or clear it with null).
const setSceneAnimationResumables = useCallback(
  (resumables: Map<number, SceneAnimationResume> | null) => {
    setState((current) => {
      return { ...current, sceneAnimationResumables: resumables };
    });
  },
  []
);
// Replace the map of generated audio URLs by scene number (or clear it with null).
const setSceneAudio = useCallback(
  (audio: Map<number, string> | null) => {
    setState((current) => {
      return { ...current, sceneAudio: audio };
    });
  },
  []
);
@@ -471,6 +498,8 @@ export const useStoryWriterState = () => {
setSceneAudio,
setStoryVideo,
setSceneHdVideos,
setSceneAnimatedVideos,
setSceneAnimationResumables,
setHdVideoGenerationStatus,
setCurrentHdSceneIndex,
setCurrentTaskId,

View File

@@ -587,6 +587,127 @@ export const formatCurrency = (amount: number): string => {
}).format(amount);
};
// Pre-flight check interfaces
/**
 * One operation to validate before it is executed.
 * Sent to the `/preflight-check` endpoint inside an `operations` array.
 */
export interface PreflightOperation {
provider: string; // e.g. 'video' (as passed by the HD video button)
model?: string; // e.g. 'tencent/HunyuanVideo'
tokens_requested?: number;
operation_type: string; // e.g. 'video_generation'
actual_provider_name?: string; // e.g. 'huggingface'
}
/**
 * Usage versus limit for one operation, as reported by the pre-flight check.
 */
export interface PreflightLimitInfo {
current_usage: number;
limit: number; // OperationButton treats a limit of 0 as "Unlimited"
remaining: number;
}
/**
 * Per-operation outcome inside a pre-flight check response.
 */
export interface PreflightOperationResult {
provider: string;
operation_type: string;
cost: number;
allowed: boolean;
limit_info: PreflightLimitInfo | null;
message: string | null; // Shown to the user as the reason when the check cannot proceed
}
/**
 * Payload returned in `data` by the `/preflight-check` endpoint.
 */
export interface PreflightCheckResponse {
can_proceed: boolean; // false in the locally built fallback response when the request fails
estimated_cost: number; // Displayed as USD by OperationButton
operations: PreflightOperationResult[]; // One entry per requested operation in the fallback response
total_cost: number; // NOTE(review): relation to estimated_cost is not visible here — confirm with backend
usage_summary: {
current_calls: number;
limit: number;
remaining: number;
} | null;
cached: boolean; // presumably whether the backend served this from its own cache — TODO confirm
}
/**
 * Turn the `detail` of a failed billing request into a displayable string.
 *
 * `detail` is not guaranteed to be a string (it may be an object or a list,
 * e.g. for validation errors). The result ends up in
 * `PreflightOperationResult.message`, which is typed `string | null` and is
 * rendered directly in the UI, so anything else is reduced to text here.
 */
const describePreflightFailure = (error: any): string => {
  const fallback = 'Pre-flight check failed';
  const detail = error?.response?.data?.detail;

  if (typeof detail === 'string') {
    return detail || fallback;
  }
  if (Array.isArray(detail)) {
    const messages = detail
      .map((item: any) => (typeof item === 'string' ? item : item?.msg || item?.message))
      .filter((text: any) => typeof text === 'string' && text.length > 0);
    return messages.length > 0 ? messages.join('; ') : fallback;
  }
  if (detail && typeof detail === 'object') {
    const nested = detail.message || detail.error;
    return typeof nested === 'string' && nested ? nested : fallback;
  }
  return fallback;
};

/**
 * Shared implementation of the single and batch pre-flight checks.
 *
 * Never rejects: on any failure it logs the error under `logLabel` and
 * resolves with a response that blocks every requested operation.
 */
const runPreflightCheck = async (
  operations: PreflightOperation[],
  logLabel: string
): Promise<PreflightCheckResponse> => {
  try {
    const response = await billingAPI.post<{ success: boolean; data: PreflightCheckResponse }>(
      '/preflight-check',
      {
        operations
      }
    );

    if (!response.data.success) {
      throw new Error('Pre-flight check failed');
    }

    return response.data.data;
  } catch (error: any) {
    console.error(`[BillingService] ${logLabel}:`, error);
    const message = describePreflightFailure(error);

    // Return a safe default response on error
    return {
      can_proceed: false,
      estimated_cost: 0,
      operations: operations.map(op => ({
        provider: op.provider,
        operation_type: op.operation_type,
        cost: 0,
        allowed: false,
        limit_info: null,
        message
      })),
      total_cost: 0,
      usage_summary: null,
      cached: false
    };
  }
};

/**
 * Check pre-flight validation for a single operation.
 * Returns cost estimation, limits check, and usage information.
 * Resolves with a blocking response (never rejects) when the request fails.
 */
export const checkPreflight = async (
  operation: PreflightOperation
): Promise<PreflightCheckResponse> => {
  return runPreflightCheck([operation], 'Pre-flight check error');
};

/**
 * Check pre-flight validation for multiple operations in a single request.
 * Useful for pages with many buttons to reduce API calls.
 * Resolves with a blocking response (never rejects) when the request fails.
 */
export const checkPreflightBatch = async (
  operations: PreflightOperation[]
): Promise<PreflightCheckResponse> => {
  return runPreflightCheck(operations, 'Pre-flight batch check error');
};
/** Format a number with en-US grouping separators (e.g. 1234567 -> "1,234,567"). */
export const formatNumber = (num: number): string => {
  const formatter = new Intl.NumberFormat('en-US');
  return formatter.format(num);
};

View File

@@ -204,8 +204,10 @@ export interface StoryAudioGenerationResponse {
export interface StoryVideoGenerationRequest {
scenes: StoryScene[];
image_urls: string[];
image_urls: (string | null)[];
audio_urls: string[];
video_urls?: (string | null)[] | null;
ai_audio_urls?: (string | null)[] | null;
story_title?: string;
fps?: number;
transition_duration?: number;
@@ -227,6 +229,38 @@ export interface StoryVideoGenerationResponse {
task_id?: string;
}
/**
 * Request body for `StoryWriterApi.animateScene`
 * (POST `/api/story/animate-scene-preview`).
 */
export interface AnimateSceneRequest {
scene_number: number;
scene_data: StoryScene;
story_context: Record<string, any>;
image_url: string; // Scene image to animate
duration?: 5 | 10; // Only these two values are allowed; unit not shown here (presumably seconds)
}
/**
 * Request body for `StoryWriterApi.animateSceneVoiceover`
 * (POST `/api/story/animate-scene-voiceover`); extends the plain animation
 * request with the voiceover audio.
 */
export interface AnimateSceneVoiceoverRequest extends AnimateSceneRequest {
audio_url: string;
resolution?: '480p' | '720p';
prompt?: string;
}
/**
 * Response returned by `StoryWriterApi.animateScene` and
 * `StoryWriterApi.resumeAnimateScene`.
 */
export interface AnimateSceneResponse {
success: boolean;
scene_number: number;
video_filename: string;
video_url: string;
duration: number;
cost: number;
prompt_used: string;
provider: string;
prediction_id?: string; // presumably usable with ResumeAnimateSceneRequest.prediction_id — TODO confirm
}
/**
 * Request body for `StoryWriterApi.resumeAnimateScene`
 * (POST `/api/story/animate-scene-resume`).
 */
export interface ResumeAnimateSceneRequest {
prediction_id: string;
scene_number: number;
duration?: 5 | 10;
}
class StoryWriterApi {
/**
* Generate 3 story setup options from a user's story idea
@@ -373,20 +407,63 @@ class StoryWriterApi {
return response.data;
}
/**
 * Animate a single scene image into a short video preview.
 * POSTs the request to `/api/story/animate-scene-preview` and resolves with
 * the response body.
 */
async animateScene(request: AnimateSceneRequest): Promise<AnimateSceneResponse> {
  const endpoint = "/api/story/animate-scene-preview";
  const { data } = await aiApiClient.post<AnimateSceneResponse>(endpoint, request);
  return data;
}
/**
 * Start an asynchronous scene animation with voiceover (WaveSpeed InfiniteTalk).
 * Resolves with a task_id to poll, since InfiniteTalk can take up to 10 minutes.
 */
async animateSceneVoiceover(request: AnimateSceneVoiceoverRequest): Promise<{ task_id: string; status: string; message: string }> {
  const endpoint = "/api/story/animate-scene-voiceover";
  const { data } = await aiApiClient.post<{ task_id: string; status: string; message: string }>(
    endpoint,
    request
  );
  return data;
}
/**
 * Resume a timed-out scene animation download using the prediction id.
 * POSTs the request to `/api/story/animate-scene-resume` and resolves with
 * the response body.
 */
async resumeAnimateScene(request: ResumeAnimateSceneRequest): Promise<AnimateSceneResponse> {
  const endpoint = "/api/story/animate-scene-resume";
  const { data } = await aiApiClient.post<AnimateSceneResponse>(endpoint, request);
  return data;
}
/**
 * Resolve a path against the API client's base URL.
 * Empty values and paths that already start with http:// or https:// are
 * returned unchanged; otherwise base URL and path are joined with exactly
 * one slash between them.
 */
private buildAbsoluteUrl(path: string): string {
  const alreadyAbsolute =
    !path || ['http://', 'https://'].some((scheme) => path.startsWith(scheme));
  if (alreadyAbsolute) {
    return path;
  }

  const root = aiApiClient.defaults.baseURL || '';
  // Drop a single trailing slash from the base and ensure a leading one on the path
  const trimmedRoot = root.endsWith('/') ? root.slice(0, -1) : root;
  const suffix = path.startsWith('/') ? path : `/${path}`;
  return `${trimmedRoot}${suffix}`;
}
/**
* Get image URL for a scene image
*/
getImageUrl(imageUrl: string): string {
// If imageUrl is already a full URL, return it as-is
if (imageUrl.startsWith('http://') || imageUrl.startsWith('https://')) {
return imageUrl;
return this.buildAbsoluteUrl(imageUrl);
}
// Otherwise, prepend the base URL
const baseURL = aiApiClient.defaults.baseURL || '';
// Remove trailing slash from baseURL if present, and leading slash from imageUrl if present
const cleanBaseURL = baseURL.endsWith('/') ? baseURL.slice(0, -1) : baseURL;
const cleanImageUrl = imageUrl.startsWith('/') ? imageUrl : `/${imageUrl}`;
return `${cleanBaseURL}${cleanImageUrl}`;
/**
* Convert any relative media URL to absolute
*/
getMediaUrl(path: string): string {
return this.buildAbsoluteUrl(path);
}
/**
@@ -400,6 +477,165 @@ class StoryWriterApi {
return response.data;
}
/**
 * Optimize an image prompt using the WaveSpeed prompt optimizer.
 * POSTs the request to `/api/story/optimize-prompt` and resolves with the
 * response body.
 */
async optimizePrompt(request: {
  text: string;
  mode?: 'image' | 'video';
  style?: 'default' | 'artistic' | 'photographic' | 'technical' | 'anime' | 'realistic';
  image?: string;
}): Promise<{ optimized_prompt: string; success: boolean }> {
  const endpoint = "/api/story/optimize-prompt";
  const { data } = await aiApiClient.post<{ optimized_prompt: string; success: boolean }>(
    endpoint,
    request
  );
  return data;
}
/**
 * Regenerate a scene image using a direct prompt (no AI prompt generation).
 * POSTs the request to `/api/story/regenerate-images` and resolves with the
 * response body.
 */
async regenerateSceneImage(request: {
  scene_number: number;
  scene_title: string;
  prompt: string;
  provider?: string;
  width?: number;
  height?: number;
  model?: string;
}): Promise<{
  scene_number: number;
  scene_title: string;
  image_filename: string;
  image_url: string;
  width: number;
  height: number;
  provider: string;
  model?: string;
  seed?: number;
  success: boolean;
  error?: string;
}> {
  const endpoint = "/api/story/regenerate-images";
  const { data } = await aiApiClient.post<{
    scene_number: number;
    scene_title: string;
    image_filename: string;
    image_url: string;
    width: number;
    height: number;
    provider: string;
    model?: string;
    seed?: number;
    success: boolean;
    error?: string;
  }>(endpoint, request);
  return data;
}
/**
 * Generate AI audio for a single scene using WaveSpeed Minimax Speech 02 HD.
 * POSTs the request to `/api/story/generate-ai-audio` and resolves with the
 * response body.
 */
async generateAIAudio(request: {
  scene_number: number;
  scene_title: string;
  text: string;
  voice_id?: string;
  speed?: number;
  volume?: number;
  pitch?: number;
  emotion?: string;
}): Promise<{
  scene_number: number;
  scene_title: string;
  audio_filename: string;
  audio_url: string;
  provider: string;
  model: string;
  voice_id: string;
  text_length: number;
  file_size: number;
  cost: number;
  success: boolean;
  error?: string;
}> {
  const endpoint = "/api/story/generate-ai-audio";
  const { data } = await aiApiClient.post<{
    scene_number: number;
    scene_title: string;
    audio_filename: string;
    audio_url: string;
    provider: string;
    model: string;
    voice_id: string;
    text_length: number;
    file_size: number;
    cost: number;
    success: boolean;
    error?: string;
  }>(endpoint, request);
  return data;
}
/**
 * Generate free audio for a single scene using gTTS.
 *
 * Wraps the multi-scene `/api/story/generate-audio` endpoint with a
 * one-element `scenes` array and resolves with the first returned audio file.
 * Throws when the response is unsuccessful or contains no audio file, using
 * the first file's error text when one is present.
 */
async generateFreeAudio(request: {
  scene_number: number;
  scene_title: string;
  text: string;
  provider?: string;
  lang?: string;
  slow?: boolean;
  rate?: number;
}): Promise<{
  scene_number: number;
  scene_title: string;
  audio_filename: string;
  audio_url: string;
  provider: string;
  file_size: number;
  success: boolean;
  error?: string;
}> {
  // Single scene expressed in the shape the batch endpoint expects
  const singleScene = {
    scene_number: request.scene_number,
    title: request.scene_title,
    audio_narration: request.text,
  };
  const payload = {
    scenes: [singleScene],
    provider: request.provider || 'gtts',
    lang: request.lang || 'en',
    slow: request.slow || false,
    rate: request.rate || 150,
  };

  const { data: result } = await aiApiClient.post<StoryAudioGenerationResponse>(
    "/api/story/generate-audio",
    payload
  );

  // Guard clause: anything other than a successful, non-empty result is an error
  if (!result.success || !result.audio_files || result.audio_files.length === 0) {
    throw new Error(result.audio_files?.[0]?.error || 'Failed to generate audio');
  }

  const audio = result.audio_files[0];
  return {
    scene_number: audio.scene_number,
    scene_title: audio.scene_title,
    audio_filename: audio.audio_filename,
    audio_url: audio.audio_url,
    provider: audio.provider,
    file_size: audio.file_size,
    success: true,
    error: audio.error,
  };
}
/**
* Get audio URL for a scene audio file
*/
@@ -496,7 +732,6 @@ class StoryWriterApi {
scene_data: StoryScene;
story_context: Record<string, any>;
all_scenes: StoryScene[];
scene_image_url?: string;
provider?: string;
model?: string;
num_frames?: number;

View File

@@ -1,9 +1,19 @@
import { aiApiClient } from "../api/client";
export async function fetchMediaBlobUrl(pathOrUrl: string): Promise<string> {
const rel = pathOrUrl.startsWith("/") ? pathOrUrl : `/${pathOrUrl}`;
const res = await aiApiClient.get(rel, { responseType: "blob" });
return URL.createObjectURL(res.data);
/**
 * Download a media file through the authenticated API client and expose it
 * as an object URL.
 *
 * @param pathOrUrl Either a server-relative path (with or without a leading
 *                  slash) or an absolute URL (`scheme://...`).
 * @returns An object URL for the downloaded blob, or `null` when the input is
 *          empty or the server answers 404 (the file may not exist or may
 *          have been regenerated).
 * @throws Re-throws any request error other than a 404 response.
 *
 * NOTE: the caller owns the returned object URL and should release it with
 * `URL.revokeObjectURL` once it is no longer needed.
 */
export async function fetchMediaBlobUrl(pathOrUrl: string): Promise<string | null> {
  // Nothing to fetch; avoid issuing a request for "/".
  if (!pathOrUrl) {
    return null;
  }

  // Absolute URLs must be passed through untouched: prefixing them with "/"
  // would yield "/https://..." which can never resolve.
  const isAbsoluteUrl = /^[a-z][a-z\d+\-.]*:\/\//i.test(pathOrUrl);
  const target =
    isAbsoluteUrl || pathOrUrl.startsWith("/") ? pathOrUrl : `/${pathOrUrl}`;

  try {
    const res = await aiApiClient.get(target, { responseType: "blob" });
    return URL.createObjectURL(res.data);
  } catch (err: any) {
    // Gracefully handle 404s - file might not exist or was regenerated
    if (err?.response?.status === 404) {
      console.warn(`Media file not found (404): ${pathOrUrl}`);
      return null;
    }
    // Re-throw other errors
    throw err;
  }
}

View File

@@ -0,0 +1,490 @@
# Pre-flight Check with Cost Estimation and Button Enhancement Plan
## Overview
Implement a reusable pre-flight check system that shows estimated costs on buttons and validates operations on hover. This will provide users with cost transparency and prevent unnecessary API calls by showing if operations are allowed before execution.
## Goals
1. Show estimated cost on buttons (e.g., "Generate HD Video $0.21")
2. Perform pre-flight check on hover (debounced to avoid performance issues)
3. Show detailed information (allowed/blocked, limits, remaining quota)
4. Disable buttons with appropriate messaging if limits exceeded
5. Common/reusable solution across all ALwrity tools (blog writer, story, linkedin, etc.)
6. Performance optimized (caching, debouncing, batching)
7. Foundation for billing dashboard insights about operation costs
## Current State Analysis
### Backend Existing Capabilities
- **Pre-flight validation**: `preflight_validator.py` has functions like `validate_video_generation_operations`, `validate_image_generation_operations`
- **Limit checking**: `pricing_service.py` has `check_comprehensive_limits()` and `check_usage_limits()`
- **Pricing lookup**: `get_pricing_for_provider_model()` returns cost information
- **Caching**: `_limits_cache` with TTL to reduce DB reads
- **Operation validation**: Supports multi-operation workflows with token estimation
### Frontend Existing Capabilities
- **Billing service**: `billingService.ts` has API client for subscription endpoints
- **Subscription hooks**: `useSubscriptionGuard`, `useSubscription` for subscription state
- **Button components**: Various buttons but no cost/pre-flight integration
- **Usage dashboard**: Shows usage but not per-operation costs
### Gaps
- No lightweight endpoint for cost estimation + pre-flight check
- No reusable button component with cost/pre-flight integration
- No debouncing/throttling for hover-based checks
- No consistent UX pattern across tools
## Implementation Plan
### Phase 1: Backend API Endpoint
#### 1.1 Create Pre-flight Check Endpoint
**File**: `backend/api/subscription_api.py`
**Endpoint**: `POST /api/subscription/preflight-check`
**Purpose**: Lightweight endpoint that:
- Accepts operation definition (provider, model, tokens_requested, operation_type)
- Returns cost estimation, limits check result, usage info
- Uses caching to minimize DB load
- Fast response (< 100ms with cache hit)
**Request Format**:
```json
{
"operations": [
{
"provider": "video",
"model": "tencent/HunyuanVideo",
"tokens_requested": 0,
"operation_type": "video_generation",
"actual_provider_name": "huggingface"
}
]
}
```
**Response Format**:
```json
{
"success": true,
"data": {
"can_proceed": true,
"estimated_cost": 0.21,
"operations": [
{
"provider": "video",
"operation_type": "video_generation",
"cost": 0.21,
"allowed": true,
"limit_info": {
"current_usage": 5,
"limit": 100,
"remaining": 95
},
"message": null
}
],
"total_cost": 0.21,
"usage_summary": {
"current_calls": 5,
"limit": 100,
"remaining": 95
},
"cached": false
}
}
```
**Implementation Details**:
- Use `PricingService.check_comprehensive_limits()` for validation
- Use `PricingService.get_pricing_for_provider_model()` for cost
- Leverage existing `_limits_cache` (5-second TTL)
- Return structured error if blocked with user-friendly message
#### 1.2 Batch Pre-flight Check Endpoint (Optional, for performance)
**Endpoint**: `POST /api/subscription/preflight-check-batch`
**Purpose**: Check multiple operations at once for pages with many buttons
**Performance Considerations**:
- Single DB query for all operations
- Batch cache lookups
- Return results in order matching request
### Phase 2: Frontend Service Layer
#### 2.1 Extend Billing Service
**File**: `frontend/src/services/billingService.ts`
**New Functions**:
```typescript
interface PreflightOperation {
provider: string;
model?: string;
tokens_requested?: number;
operation_type: string;
actual_provider_name?: string;
}
interface PreflightCheckResponse {
can_proceed: boolean;
estimated_cost: number;
operations: Array<{
provider: string;
operation_type: string;
cost: number;
allowed: boolean;
limit_info: {
current_usage: number;
limit: number;
remaining: number;
};
message: string | null;
}>;
total_cost: number;
usage_summary: {
current_calls: number;
limit: number;
remaining: number;
};
cached: boolean;
}
// Single operation check
export const checkPreflight = async (
operation: PreflightOperation
): Promise<PreflightCheckResponse>
// Batch operations check (for pages with many buttons)
export const checkPreflightBatch = async (
operations: PreflightOperation[]
): Promise<PreflightCheckResponse>
```
**Implementation Details**:
- Use axios with request cancellation support
- Add request debouncing wrapper
- Handle errors gracefully (show cached result if available)
- Return structured error messages for UI display
#### 2.2 Create Pre-flight Check Hook
**File**: `frontend/src/hooks/usePreflightCheck.ts`
**Purpose**: Reusable React hook that:
- Manages pre-flight check state (loading, error, result)
- Debounces hover events (300ms delay)
- Caches results per operation (5-second TTL)
- Provides easy-to-use API for components
**API**:
```typescript
interface UsePreflightCheckOptions {
operation: PreflightOperation;
enabled?: boolean; // Whether to perform check on hover
debounceMs?: number; // Debounce delay (default: 300ms)
cacheTtl?: number; // Cache TTL in ms (default: 5000ms)
}
interface UsePreflightCheckResult {
canProceed: boolean;
estimatedCost: number;
limitInfo: {
current: number;
limit: number;
remaining: number;
} | null;
loading: boolean;
error: string | null;
checkOnHover: () => void;
checkNow: () => void; // Immediate check
reset: () => void;
}
export const usePreflightCheck = (
options: UsePreflightCheckOptions
): UsePreflightCheckResult
```
**Implementation Details**:
- Use `useState` for state management
- Use `useCallback` for memoized handlers
- Use `useRef` for debounce timers and cache
- Implement request cancellation on unmount
### Phase 3: Reusable Button Component
#### 3.1 Create Enhanced Operation Button Component
**File**: `frontend/src/components/shared/OperationButton.tsx`
**Purpose**: Reusable button component that:
- Shows estimated cost in button label
- Performs pre-flight check on hover
- Shows detailed tooltip with limits/remaining quota
- Disables button with messaging if blocked
- Supports all operation types (video, image, image_edit, text generation, etc.)
**Props**:
```typescript
interface OperationButtonProps {
// Operation definition
operation: PreflightOperation;
// Button configuration
label: string; // Base label (e.g., "Generate HD Video")
variant?: 'contained' | 'outlined' | 'text';
size?: 'small' | 'medium' | 'large';
color?: 'primary' | 'secondary' | 'success' | 'error';
startIcon?: React.ReactNode;
endIcon?: React.ReactNode;
// Pre-flight check behavior
showCost?: boolean; // Show cost in label (default: true)
checkOnHover?: boolean; // Check on hover (default: true)
checkOnMount?: boolean; // Check on mount (default: false)
// Callbacks
onClick: () => void;
onPreflightResult?: (result: PreflightCheckResponse) => void;
// Customization
disabled?: boolean; // Additional disabled state
loading?: boolean; // Loading state override
tooltipPlacement?: 'top' | 'bottom' | 'left' | 'right';
// Styling
sx?: SxProps<Theme>;
fullWidth?: boolean;
}
```
**Features**:
- Cost display: "Generate HD Video $0.21" or "Generate HD Video" if cost unavailable
- Tooltip on hover shows:
- Operation allowed/blocked status
- Current usage / limit / remaining
- Estimated cost breakdown
- Message if blocked (e.g., "You've reached your video generation limit. Upgrade your plan for more videos.")
- Button disabled if:
- `disabled` prop is true
- `loading` prop is true
- Pre-flight check returned `can_proceed: false`
- Button styling:
- Normal: standard button
- Blocked: grayed out with warning icon
- Loading: spinner with disabled state
**Implementation Details**:
- Use Material-UI `Button` and `Tooltip` components
- Integrate with `usePreflightCheck` hook
- Format cost as currency (e.g., "$0.21" or "$0.00" if free)
- Handle edge cases (no subscription, no limits, etc.)
#### 3.2 Create Operation Type Mappings
**File**: `frontend/src/utils/operationTypes.ts`
**Purpose**: Centralized configuration for operation types:
- Default models per operation type
- Display names
- Icons
- Default token estimates
```typescript
export const OPERATION_TYPES = {
video_generation: {
provider: 'video',
defaultModel: 'tencent/HunyuanVideo',
displayName: 'Video Generation',
icon: VideoLibraryIcon,
defaultTokens: 0,
},
image_generation: {
provider: 'stability',
defaultModel: 'stability-ai/stable-diffusion-xl',
displayName: 'Image Generation',
icon: ImageIcon,
defaultTokens: 0,
},
image_editing: {
provider: 'image_edit',
defaultModel: 'Qwen/Qwen-Image-Edit',
displayName: 'Image Editing',
icon: EditIcon,
defaultTokens: 0,
},
// ... more operation types
} as const;
```
### Phase 4: Integration Across Tools
#### 4.1 Story Writer Integration
**Files**:
- `frontend/src/components/StoryWriter/components/HdVideoSection.tsx`
- `frontend/src/components/StoryWriter/components/VideoSection.tsx`
- `frontend/src/components/StoryWriter/components/MultimediaToolbar.tsx`
**Changes**:
- Replace existing buttons with `OperationButton`
- Configure with appropriate operation type
- Pass existing `onClick` handlers
**Example**:
```tsx
<OperationButton
operation={{
provider: 'video',
model: 'tencent/HunyuanVideo',
tokens_requested: 0,
operation_type: 'video_generation',
actual_provider_name: 'huggingface',
}}
label="Generate HD Animation"
showCost={true}
checkOnHover={true}
onClick={handleGenerateHdVideo}
disabled={isGeneratingHdVideo || state.hdVideoGenerationStatus === 'awaiting_approval'}
loading={isGeneratingHdVideo}
/>
```
#### 4.2 Blog Writer Integration
**Files**: Various blog writer components with generation buttons
**Changes**: Similar to Story Writer - replace buttons with `OperationButton`
#### 4.3 LinkedIn Writer Integration
**Files**: LinkedIn writer components
**Changes**: Similar pattern
### Phase 5: Performance Optimization
#### 5.1 Caching Strategy
**Backend**:
- Use existing `_limits_cache` (5-second TTL)
- Cache pre-flight check results per user:operation combination
- Invalidate cache on usage updates
**Frontend**:
- In-memory cache per hook instance (5-second TTL)
- Share cache across components using React Context
- Clear cache on subscription changes
#### 5.2 Debouncing/Throttling
**Frontend**:
- Debounce hover events (300ms delay)
- Throttle batch requests (max 1 request per 500ms)
- Cancel in-flight requests on unmount/hover exit
#### 5.3 Request Batching
**Frontend**:
- For pages with many buttons (e.g., story export with multiple operations)
- Batch multiple operations into single request
- Use `checkPreflightBatch` API
#### 5.4 Lazy Loading
**Frontend**:
- Only check on hover (not on mount)
- Optional: Check on mount for primary buttons only
- Defer checks for secondary/tertiary buttons
### Phase 6: Billing Dashboard Integration (Future)
#### 6.1 Operation Cost Tracking
**Backend**:
- Track operation costs in `APIUsageLog` (already exists)
- Reuse the `operation_type` field on logs (already exists)
#### 6.2 Cost Insights
**Frontend**:
- Add operation cost breakdown to billing dashboard
- Show most expensive operations
- Show cost trends per operation type
- Add filters by operation type
## Performance Considerations
### Potential Bottlenecks
1. **Many buttons on one page**: Hovering over each button could trigger its own request
- **Solution**: Batch requests, debounce, cache aggressively
2. **Rapid hover in/out**: Multiple requests for same operation
- **Solution**: Debounce (300ms), cancel in-flight requests
3. **Backend DB load**: Each check queries subscription/usage tables
- **Solution**: Use existing cache (5-second TTL), optimize queries
4. **Frontend render performance**: Many tooltips updating
- **Solution**: Virtualize if needed, optimize re-renders with React.memo
### Performance Targets
- Pre-flight check API: < 100ms with cache hit, < 300ms without cache
- Frontend hover response: < 50ms (measured after the 300ms debounce delay)
- Batch check (10 operations): < 500ms
- Tooltip render: < 16ms (60fps)
## Testing Strategy
### Unit Tests
- `usePreflightCheck` hook: debouncing, caching, error handling
- `OperationButton` component: cost display, tooltip, disabled states
- Billing service: API calls, error handling
### Integration Tests
- Pre-flight check endpoint: validation, cost calculation, caching
- Button hover behavior: tooltip display, disabled states
### E2E Tests
- User hovers over button, sees cost and limits
- User blocked by limits, sees appropriate messaging
- User clicks button, operation executes (or fails with clear error)
## Migration Strategy
### Phase 1: Backend (Week 1)
1. Create pre-flight check endpoint
2. Add unit tests
3. Deploy and monitor performance
### Phase 2: Frontend Core (Week 2)
1. Extend billing service
2. Create `usePreflightCheck` hook
3. Create `OperationButton` component
4. Add unit tests
### Phase 3: Integration (Week 3)
1. Integrate into Story Writer (highest priority - most buttons)
2. Test thoroughly
3. Iterate based on feedback
### Phase 4: Rollout (Week 4+)
1. Integrate into Blog Writer
2. Integrate into LinkedIn Writer
3. Integrate into other tools
4. Monitor performance and user feedback
## Success Metrics
1. **User Experience**:
- Reduced confusion about operation costs
- Fewer failed operations due to limits
- Increased clarity about remaining quota
2. **Performance**:
- < 100ms API response time (with cache)
- < 1% increase in backend DB load
- No noticeable UI lag on pages with many buttons
3. **Adoption**:
- All major operation buttons using new component
- Consistent UX across all tools
## Future Enhancements
1. **Cost estimation for multi-operation workflows**: Estimate total cost for complex operations
2. **Usage predictions**: Show projected usage if user continues current pattern
3. **Cost optimization suggestions**: Suggest cheaper alternatives
4. **Batch operation approval**: Show total cost and allow approval for multiple operations
5. **Cost alerts**: Warn users approaching cost limits
6. **Operation history**: Show recent operations and their costs in tooltip