AI Image and Audio Generation Improvements.

AI Video Generation Pre-Flight Checklist. Cost Estimate Improvements.
2025-12-25 16:26:08 +05:30
parent 59913bffa9
commit 7512933c65
163 changed files with 8938 additions and 37401 deletions
--- a/frontend/src/components/YouTubeCreator/YouTubeCreator.tsx
+++ b/frontend/src/components/YouTubeCreator/YouTubeCreator.tsx
@@ -24,12 +24,16 @@ import { youtubeApi, type VideoPlan, type Scene } from '../../services/youtubeAp
 import { STEPS, YT_RED, YT_BG, YT_BORDER, YT_TEXT, type Resolution, type DurationType, type VideoType } from './constants';
 import { PlanStep } from './components/PlanStep';
 import { ScenesStep } from './components/ScenesStep';
+import { SceneGenerationStep } from './components/SceneGenerationStep';
 import { RenderStep } from './components/RenderStep';
 import { useRenderPolling } from './hooks/useRenderPolling';
 import { useCostEstimate } from './hooks/useCostEstimate';
+import { useImageGenerationPolling } from './hooks/useImageGenerationPolling';
 import HeaderControls from '../shared/HeaderControls';
 import { useYouTubeCreatorState } from '../../hooks/useYouTubeCreatorState';
 import { ContentAsset } from '../../hooks/useContentAssets';
+import { AudioGenerationSettings } from '../../components/shared/AudioSettingsModal';
+import type { YouTubeImageGenerationSettings } from './shared';

 const YouTubeCreator: React.FC = () => {
  const navigate = useNavigate();
@@ -65,6 +69,11 @@ const YouTubeCreator: React.FC = () => {
  const [uploadingAvatar, setUploadingAvatar] = useState(false);
  const [makingPresentable, setMakingPresentable] = useState(false);
  const [regeneratingAvatar, setRegeneratingAvatar] = useState(false);
+  const [generatingImageSceneId, setGeneratingImageSceneId] = useState<number | null>(null);
+  const [generatingAudioSceneId, setGeneratingAudioSceneId] = useState<number | null>(null);
+  
+  // Robust polling hook for image generation
+  const { startPolling: startImagePolling, stopPolling: stopImagePolling } = useImageGenerationPolling();

  // Sync activeStep with persisted state on mount
  useEffect(() => {
@@ -105,6 +114,7 @@ const YouTubeCreator: React.FC = () => {
    scenes,
    resolution,
    renderTaskId,
+    imageModel: 'ideogram-v3-turbo', // Default for now, can be made configurable later
  });

  // Memoized computed values
@@ -312,7 +322,7 @@ const YouTubeCreator: React.FC = () => {

        updateState({ scenes: updatedScenes });
        setSuccess(successMessage);
-        // Navigate immediately to Render step so user can see scenes and cost estimates
+        // Navigate to Scene Generation step (step 2) to generate assets
        setActiveStep(2);
        // Clear success message after a brief moment
        setTimeout(() => {
@@ -391,6 +401,246 @@ const YouTubeCreator: React.FC = () => {
    updateState({ scenes: updatedScenes });
  }, [scenes, updateState]);

+  // Mirror of the latest `scenes` value. The polling callbacks below run long
+  // after the render that created them, so they read scenes through this ref
+  // instead of closing over a snapshot that may be outdated by the time the
+  // image is ready (e.g. audio was generated for a scene in the meantime).
+  const latestScenesForImageRef = React.useRef(scenes);
+  useEffect(() => {
+    latestScenesForImageRef.current = scenes;
+  }, [scenes]);
+
+  /**
+   * Starts an image generation task for a scene and polls it until it finishes.
+   *
+   * On completion the scene's `imageUrl` is stored via `updateState`; when polling
+   * reports an error the error banner is set. Errors raised while starting the
+   * task are shown in the banner and re-thrown so the caller can react as well.
+   *
+   * @param scene - Scene to generate an image for.
+   * @param imageSettings - Optional per-request overrides (prompt, style, rendering speed, aspect ratio, model).
+   * @throws Whatever `youtubeApi.generateSceneImage` throws, or an `Error` when the
+   *   response reports `success: false`.
+   */
+  const handleGenerateSceneImage = useCallback(async (scene: Scene, imageSettings?: YouTubeImageGenerationSettings) => {
+    console.log('[YouTubeCreator] handleGenerateSceneImage called for scene', scene.scene_number);
+    console.log('[YouTubeCreator] This should ONLY be called for image generation, NOT audio generation');
+
+    // Guard: prevent if already generating image for this scene
+    if (generatingImageSceneId === scene.scene_number) {
+      console.warn('[YouTubeCreator] Image generation already in progress for this scene');
+      return;
+    }
+
+    setGeneratingImageSceneId(scene.scene_number);
+    setError(null);
+
+    try {
+      console.log('[YouTubeCreator] Starting image generation task for scene', scene.scene_number);
+
+      const taskResponse = await youtubeApi.generateSceneImage({
+        sceneId: `scene_${scene.scene_number}`,
+        sceneTitle: scene.title,
+        sceneContent: scene.narration,
+        baseAvatarUrl: avatarUrl || undefined,
+        idea: videoPlan?.video_summary || userIdea,
+        width: 1024,
+        height: 576,
+        customPrompt: imageSettings?.prompt,
+        style: imageSettings?.style,
+        renderingSpeed: imageSettings?.renderingSpeed,
+        aspectRatio: imageSettings?.aspectRatio,
+        model: imageSettings?.model,
+      });
+
+      console.log('[YouTubeCreator] Image generation task started:', taskResponse);
+
+      if (!taskResponse.success) {
+        throw new Error(taskResponse.message || 'Failed to start image generation task');
+      }
+
+      const taskId = taskResponse.task_id;
+
+      // Start robust polling
+      startImagePolling({
+        taskId,
+        sceneNumber: scene.scene_number,
+        getStatus: youtubeApi.getImageGenerationStatus,
+        onComplete: (imageUrl) => {
+          console.log('[YouTubeCreator] Image generation completed!', {
+            sceneNumber: scene.scene_number,
+            imageUrl,
+          });
+
+          // Merge the image URL into the *current* scenes (via the ref), not the
+          // array captured when generation started, so concurrent updates to
+          // other scene fields are not overwritten.
+          const updatedScenes = latestScenesForImageRef.current.map(s =>
+            s.scene_number === scene.scene_number
+              ? { ...s, imageUrl }
+              : s
+          );
+          updateState({ scenes: updatedScenes });
+
+          setSuccess(`Image generated for Scene ${scene.scene_number}!`);
+          setTimeout(() => setSuccess(null), 3000);
+          setGeneratingImageSceneId(null);
+        },
+        onError: (errorMsg) => {
+          setError(errorMsg);
+          setGeneratingImageSceneId(null);
+        },
+        onProgress: (progress, message) => {
+          console.log(`[YouTubeCreator] Image generation in progress: ${progress}% - ${message}`);
+        },
+      });
+
+    } catch (err: any) {
+      // Prefer the most specific message the API error payload offers.
+      const errorMessage = err?.response?.data?.detail?.message
+        || err?.response?.data?.detail?.error
+        || err?.response?.data?.detail
+        || err?.message
+        || 'Failed to start image generation';
+      setError(`Scene ${scene.scene_number}: ${errorMessage}`);
+      setGeneratingImageSceneId(null);
+      throw err; // Re-throw so SceneCard can handle it
+    }
+    // `scenes` is intentionally absent: it is read through latestScenesForImageRef.
+  }, [avatarUrl, videoPlan, userIdea, updateState, generatingImageSceneId, startImagePolling]);
+
+  /**
+   * Builds the text sent for audio generation of a scene.
+   *
+   * The result is the narration (prefixed with the scene title unless the
+   * narration already starts with it) followed by optional bracketed delivery
+   * directives derived from the first emphasis tag, pacing-related visual cues,
+   * and the scene's duration estimate. The result is capped at 9500 characters
+   * plus a trailing ellipsis.
+   *
+   * @param scene - Scene whose narration and metadata are combined.
+   * @returns The enriched text; a missing narration is treated as an empty string.
+   */
+  const buildEnrichedSceneText = (scene: Scene): string => {
+    // Narration can be absent on a partially built scene; do not throw on it.
+    const narration = scene.narration ?? '';
+
+    // Spoken portion: narration, with the title in front for context unless the
+    // narration already opens with the title.
+    const spokenText = scene.title && !narration.startsWith(scene.title)
+      ? `${scene.title}. ${narration}`
+      : narration;
+
+    const directives: string[] = [];
+
+    // Delivery style hint: only the primary (first) emphasis tag is used.
+    const primaryTag = scene.emphasis_tags && scene.emphasis_tags.length > 0
+      ? scene.emphasis_tags[0]
+      : undefined;
+    if (primaryTag !== undefined) {
+      let deliveryHint: string;
+      switch (primaryTag) {
+        case 'hook': deliveryHint = 'speak with energy and excitement'; break;
+        case 'cta': deliveryHint = 'speak persuasively and confidently'; break;
+        case 'transition': deliveryHint = 'speak smoothly and clearly'; break;
+        default: deliveryHint = 'speak professionally and clearly';
+      }
+      directives.push(`[${deliveryHint}]`);
+    }
+
+    // Visual cues that also say something about pacing or intensity.
+    const pacingKeywords = ['slow', 'fast', 'energetic', 'calm', 'dramatic', 'intense'];
+    const audioRelevantCues = (scene.visual_cues ?? []).filter(cue => {
+      const lowered = cue.toLowerCase();
+      return pacingKeywords.some(keyword => lowered.includes(keyword));
+    });
+    if (audioRelevantCues.length > 0) {
+      directives.push(`[Pacing: ${audioRelevantCues.join(', ')}]`);
+    }
+
+    // Pacing directive from the duration estimate. Only the spoken words are
+    // counted: the bracketed directives are instructions, not narration, and
+    // runs of whitespace must not count as extra words.
+    if (scene.duration_estimate && scene.duration_estimate > 0) {
+      const spokenWordCount = spokenText.split(/\s+/).filter(Boolean).length;
+      const wordsPerMinute = spokenWordCount / (scene.duration_estimate / 60);
+      if (wordsPerMinute > 200) {
+        directives.push('[Speak at a natural, conversational pace]');
+      } else if (wordsPerMinute < 120) {
+        directives.push('[Take time to articulate clearly]');
+      }
+    }
+
+    let enrichedText = [spokenText, ...directives].join(' ');
+
+    // Ensure we don't exceed WaveSpeed's 10,000 character limit
+    if (enrichedText.length > 9500) {
+      enrichedText = enrichedText.substring(0, 9500) + '...';
+    }
+
+    return enrichedText;
+  };
+
+  // Mirror of the latest `scenes` value. The audio request is awaited, so by the
+  // time it resolves the `scenes` array seen by this render may be outdated
+  // (e.g. an image finished generating meanwhile). Reading through the ref keeps
+  // the scene update from overwriting those newer fields.
+  const latestScenesForAudioRef = React.useRef(scenes);
+  useEffect(() => {
+    latestScenesForAudioRef.current = scenes;
+  }, [scenes]);
+
+  /**
+   * Generates narration audio for a scene and stores the resulting `audioUrl`
+   * on that scene via `updateState`.
+   *
+   * @param scene - Scene to narrate.
+   * @param audioSettings - Optional settings; when omitted, the defaults below are used.
+   * @throws Whatever `youtubeApi.generateSceneAudio` throws; the error is shown in
+   *   the banner and re-thrown so the caller can react as well.
+   */
+  const handleGenerateSceneAudio = useCallback(async (scene: Scene, audioSettings?: AudioGenerationSettings) => {
+    console.log('[YouTubeCreator] handleGenerateSceneAudio called for scene', scene.scene_number);
+    console.log('[YouTubeCreator] This should ONLY be called for audio generation, NOT image generation');
+
+    // Guard: prevent if already generating audio for this scene
+    if (generatingAudioSceneId === scene.scene_number) {
+      console.warn('[YouTubeCreator] Audio generation already in progress for this scene');
+      return;
+    }
+
+    setGeneratingAudioSceneId(scene.scene_number);
+    setError(null);
+
+    try {
+      // Enhanced audio defaults optimized for YouTube content
+      // Based on research into natural speech patterns and user feedback
+      // Speed 1.08: Natural conversational pace (engaging but not rushed)
+      // Voice: Auto-selected based on content analysis
+      // Emotion: Auto-selected based on scene content
+      // High quality settings for professional YouTube audio
+      const settings: AudioGenerationSettings = audioSettings || {
+        voiceId: "", // Empty string triggers auto-selection by backend
+        speed: 1.08, // Natural conversational pace - engaging but comfortable
+        volume: 1.0, // Standard volume
+        pitch: 0.0, // Neutral pitch for natural sound
+        emotion: "happy", // Default emotion (backend will auto-select based on content)
+        englishNormalization: true, // Better handling of numbers, dates, and technical terms
+        sampleRate: 44100, // CD quality audio
+        bitrate: 256000, // Highest quality: 256kbps for professional audio
+        channel: "2" as const, // Stereo for richer audio experience
+        format: "mp3" as const, // Universal format
+        languageBoost: "English", // Optimize for English content
+        enableSyncMode: true, // Reliable delivery
+      };
+
+      // Build enriched text for better audio generation
+      const enrichedText = buildEnrichedSceneText(scene);
+
+      console.log('[YouTubeCreator] Calling youtubeApi.generateSceneAudio with enriched text:', {
+        sceneId: `scene_${scene.scene_number}`,
+        sceneTitle: scene.title,
+        originalTextLength: scene.narration?.length,
+        enrichedTextLength: enrichedText.length,
+        voiceId: settings.voiceId || undefined, // Will auto-select if empty
+        endpoint: '/api/youtube/audio',
+        settings: settings,
+        video_plan_context: {
+          video_type: videoType,
+          target_audience: targetAudience,
+          tone: videoPlan?.tone,
+          visual_style: videoPlan?.visual_style,
+          video_goal: videoPlan?.video_goal,
+        },
+      });
+
+      const result = await youtubeApi.generateSceneAudio({
+        sceneId: `scene_${scene.scene_number}`,
+        sceneTitle: scene.title,
+        text: enrichedText, // Send enriched text instead of just narration
+        voiceId: settings.voiceId || undefined, // Will auto-select if empty
+        speed: settings.speed,
+        volume: settings.volume,
+        pitch: settings.pitch,
+        emotion: settings.emotion,
+        englishNormalization: settings.englishNormalization,
+        sampleRate: settings.sampleRate,
+        bitrate: settings.bitrate,
+        channel: settings.channel,
+        format: settings.format,
+        languageBoost: settings.languageBoost,
+        enableSyncMode: settings.enableSyncMode,
+      });
+
+      console.log('[YouTubeCreator] Audio generation result:', result);
+
+      // Merge the audio URL into the *current* scenes (via the ref) rather than
+      // the array captured before the request was awaited.
+      const updatedScenes = latestScenesForAudioRef.current.map(s =>
+        s.scene_number === scene.scene_number
+          ? { ...s, audioUrl: result.audio_url }
+          : s
+      );
+      updateState({ scenes: updatedScenes });
+      setSuccess(`Audio generated for Scene ${scene.scene_number}!`);
+      // Clear the banner after a moment, matching the image generation flow.
+      setTimeout(() => setSuccess(null), 3000);
+    } catch (err: any) {
+      // Prefer the most specific message the API error payload offers.
+      const errorMessage = err?.response?.data?.detail?.message
+        || err?.response?.data?.detail?.error
+        || err?.response?.data?.detail
+        || err?.message
+        || 'Failed to generate audio';
+      setError(errorMessage);
+      throw err; // Re-throw so SceneCard can handle it
+    } finally {
+      setGeneratingAudioSceneId(null);
+    }
+    // Every piece of component state read above is listed so the guard and the
+    // logged context are never stale. `scenes` is read through the ref, and
+    // `buildEnrichedSceneText` depends only on its argument.
+  }, [updateState, generatingAudioSceneId, videoType, targetAudience, videoPlan]);
+
  const handleStartRender = useCallback(async () => {
    if (scenes.length === 0) {
      setError('Please build scenes first');
@@ -408,6 +658,19 @@ const YouTubeCreator: React.FC = () => {
      return;
    }

+    // VALIDATION: Check that all enabled scenes have both image and audio
+    const scenesMissingAssets = enabledScenes.filter(s => !s.imageUrl || !s.audioUrl);
+    if (scenesMissingAssets.length > 0) {
+      const missingList = scenesMissingAssets.map(s => {
+        const missing = [];
+        if (!s.imageUrl) missing.push('image');
+        if (!s.audioUrl) missing.push('audio');
+        return `Scene ${s.scene_number} (missing: ${missing.join(', ')})`;
+      }).join(', ');
+      setError(`Please generate images and audio for all enabled scenes before rendering. Missing: ${missingList}`);
+      return;
+    }
+
    setLoading(true);
    setError(null);
    setSuccess(null);
@@ -472,17 +735,37 @@ const YouTubeCreator: React.FC = () => {
        return;
      }
      if (scenes.length === 0) {
-        setError('Please build scenes before rendering.');
+        setError('Please build scenes first.');
+        return;
+      }
+      setActiveStep(2);
+      return;
+    }
+
+    if (targetStep === 3) {
+      if (!videoPlan) {
+        setError('Please generate a plan first.');
+        return;
+      }
+      if (scenes.length === 0) {
+        setError('Please build scenes first.');
        return;
      }
      if (enabledScenesCount === 0) {
        setError('Enable at least one scene to render.');
        return;
      }
-      setActiveStep(2);
+      // Check if all enabled scenes have assets
+      const enabledScenes = scenes.filter(s => s.enabled !== false);
+      const allReady = enabledScenes.every(s => s.imageUrl && s.audioUrl);
+      if (!allReady) {
+        setError('Please generate images and audio for all enabled scenes first.');
+        return;
+      }
+      setActiveStep(3);
      return;
    }
-  }, [activeStep, videoPlan, scenes.length, enabledScenesCount]);
+  }, [activeStep, videoPlan, scenes, enabledScenesCount]);

  const handleResetRender = useCallback(() => {
    updateState({
@@ -637,6 +920,29 @@ const YouTubeCreator: React.FC = () => {
      )}

      {activeStep === 2 && (
+        <SceneGenerationStep
+          scenes={scenes}
+          videoPlan={videoPlan}
+          editingSceneId={editingSceneId}
+          editedScene={editedScene}
+          onEditScene={handleEditScene}
+          onSaveScene={handleSaveScene}
+          onCancelEdit={handleCancelEdit}
+          onEditChange={handleEditChange}
+          onToggleScene={handleToggleScene}
+          onGenerateImage={handleGenerateSceneImage}
+          generatingImageSceneId={generatingImageSceneId}
+          onGenerateAudio={handleGenerateSceneAudio}
+          generatingAudioSceneId={generatingAudioSceneId}
+          loading={loading}
+          avatarUrl={avatarUrl}
+          videoPlanIdea={videoPlan?.video_summary || userIdea}
+          onBack={() => setActiveStep(1)}
+          onNext={() => setActiveStep(3)}
+        />
+      )}
+
+      {activeStep === 3 && (
        <RenderStep
          renderTaskId={renderTaskId}
          renderStatus={renderStatus}
@@ -649,19 +955,13 @@ const YouTubeCreator: React.FC = () => {
          loading={loading}
          scenes={scenes}
          videoPlan={videoPlan}
-          editingSceneId={editingSceneId}
-          editedScene={editedScene}
          onResolutionChange={(value) => updateState({ resolution: value })}
          onCombineScenesChange={(value) => updateState({ combineScenes: value })}
          onStartRender={handleStartRender}
-          onBack={() => setActiveStep(1)}
+          onBack={() => setActiveStep(2)}
          onReset={handleResetRender}
          onRetryFailedScenes={handleRetryFailedScenes}
-          onEditScene={handleEditScene}
-          onSaveScene={handleSaveScene}
-          onCancelEdit={handleCancelEdit}
-          onEditChange={handleEditChange}
-          onToggleScene={handleToggleScene}
+          onScenesUpdate={(updated) => updateState({ scenes: updated })}
          getVideoUrl={getVideoUrl}
        />
      )}
--- a/frontend/src/components/YouTubeCreator/components/AssetGenerationCostCard.tsx
+++ b/frontend/src/components/YouTubeCreator/components/AssetGenerationCostCard.tsx
@@ -0,0 +1,363 @@
+/**
+ * Asset Generation Cost Card Component
+ * 
+ * Displays cost estimate for generating images and audio for scenes.
+ * Optimized for Step 3 (Generate Assets).
+ */
+
+import React from 'react';
+import {
+  Box,
+  Typography,
+  Stack,
+  Chip,
+  Alert,
+  Divider,
+} from '@mui/material';
+import {
+  MonetizationOn as MoneyIcon,
+  Image as ImageIcon,
+  VolumeUp as AudioIcon,
+  Info as InfoIcon,
+} from '@mui/icons-material';
+import { Scene } from '../../../services/youtubeApi';
+
+/** Props for {@link AssetGenerationCostCard}. */
+interface AssetGenerationCostCardProps {
+  /** All scenes of the video; scenes with `enabled === false` are excluded from the estimate. */
+  scenes: Scene[];
+}
+
+/**
+ * Card showing the estimated cost of generating the images and audio that the
+ * enabled scenes are still missing.
+ *
+ * A scene counts as needing an image when it has no `imageUrl`, and as needing
+ * audio when it has no `audioUrl`. When there are no enabled scenes a warning
+ * alert is rendered instead of the card.
+ */
+export const AssetGenerationCostCard: React.FC<AssetGenerationCostCardProps> = React.memo(({
+  scenes,
+}) => {
+  // Scenes are treated as enabled unless explicitly disabled.
+  const enabledScenes = scenes.filter(s => s.enabled !== false);
+  const numScenes = enabledScenes.length;
+
+  // Cost per asset (realistic estimates)
+  // NOTE(review): these rates are hard-coded in this component; they do not
+  // follow the image model a user may pick per scene — confirm this is intended.
+  const costPerImage = 0.10; // Ideogram V3 Turbo default
+  const costPerAudio = 0.05; // Minimax TTS
+
+  // Calculate what's needed
+  const scenesNeedingImages = enabledScenes.filter(s => !s.imageUrl).length;
+  const scenesNeedingAudio = enabledScenes.filter(s => !s.audioUrl).length;
+
+  // Calculate costs
+  const imageCost = scenesNeedingImages * costPerImage;
+  const audioCost = scenesNeedingAudio * costPerAudio;
+  const totalCost = imageCost + audioCost;
+
+  // Nothing to estimate without at least one enabled scene.
+  if (numScenes === 0) {
+    return (
+      <Alert severity="warning" sx={{ mt: 2 }}>
+        No enabled scenes to generate assets for.
+      </Alert>
+    );
+  }
+
+  return (
+    <Box
+      sx={{
+        mt: 3,
+        p: 3,
+        background: 'linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%)',
+        borderRadius: 3,
+        border: '2px solid #667eea',
+        boxShadow: '0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06)',
+      }}
+    >
+      {/* Header */}
+      <Box sx={{ display: 'flex', alignItems: 'center', gap: 1.5, mb: 3 }}>
+        <MoneyIcon sx={{ color: '#667eea', fontSize: 28 }} />
+        <Box>
+          <Typography
+            variant="h6"
+            sx={{
+              fontWeight: 700,
+              fontSize: '1.1rem',
+              color: '#1e293b',
+              letterSpacing: '-0.01em',
+            }}
+          >
+            💰 Asset Generation Cost
+          </Typography>
+          <Typography
+            variant="caption"
+            sx={{
+              color: '#64748b',
+              fontSize: '0.75rem',
+            }}
+          >
+            Cost to generate images and audio for your scenes
+          </Typography>
+        </Box>
+      </Box>
+
+      {/* Total Cost Display */}
+      <Box 
+        sx={{ 
+          mb: 3, 
+          p: 2.5,
+          bgcolor: 'white',
+          borderRadius: 2,
+          boxShadow: '0 2px 4px rgba(0, 0, 0, 0.08)',
+        }}
+      >
+        <Typography
+          variant="h3"
+          sx={{
+            fontWeight: 800,
+            fontSize: '2.5rem',
+            color: totalCost === 0 ? '#10b981' : '#667eea',
+            lineHeight: 1.2,
+            mb: 0.5,
+          }}
+        >
+          {totalCost === 0 ? 'FREE!' : `$${totalCost.toFixed(2)}`}
+        </Typography>
+        {totalCost === 0 ? (
+          <Typography
+            variant="body2"
+            sx={{
+              color: '#10b981',
+              fontSize: '0.875rem',
+              fontWeight: 600,
+            }}
+          >
+            ✅ All scenes already have their assets!
+          </Typography>
+        ) : (
+          <Typography
+            variant="body2"
+            sx={{
+              color: '#64748b',
+              fontSize: '0.875rem',
+              fontWeight: 500,
+            }}
+          >
+            To generate missing assets for {scenesNeedingImages + scenesNeedingAudio} item(s)
+          </Typography>
+        )}
+      </Box>
+
+      {/* What's Included Section */}
+      <Box
+        sx={{
+          p: 2.5,
+          bgcolor: 'white',
+          borderRadius: 2,
+          mb: 2.5,
+          boxShadow: '0 1px 3px rgba(0, 0, 0, 0.08)',
+        }}
+      >
+        <Typography
+          variant="subtitle2"
+          sx={{
+            color: '#1e293b',
+            fontWeight: 700,
+            mb: 2,
+            fontSize: '0.95rem',
+            display: 'flex',
+            alignItems: 'center',
+            gap: 1,
+          }}
+        >
+          <InfoIcon sx={{ fontSize: 18, color: '#667eea' }} />
+          What You'll Generate
+        </Typography>
+
+        <Stack spacing={2}>
+          {/* Scene Images */}
+          <Box>
+            <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 0.5 }}>
+              <ImageIcon sx={{ fontSize: 20, color: '#10b981' }} />
+              <Typography
+                variant="body2"
+                sx={{
+                  color: '#1e293b',
+                  fontWeight: 600,
+                  fontSize: '0.875rem',
+                }}
+              >
+                Scene Images
+              </Typography>
+              <Chip 
+                label={scenesNeedingImages === 0 ? 'All Ready' : `${scenesNeedingImages} needed`}
+                size="small"
+                sx={{ 
+                  ml: 'auto',
+                  bgcolor: scenesNeedingImages === 0 ? '#10b981' : '#667eea',
+                  color: 'white',
+                  fontWeight: 600,
+                  fontSize: '0.75rem',
+                }}
+              />
+              {scenesNeedingImages > 0 && (
+                <Chip 
+                  label={`$${imageCost.toFixed(2)}`}
+                  size="small"
+                  sx={{ 
+                    bgcolor: '#667eea',
+                    color: 'white',
+                    fontWeight: 600,
+                    fontSize: '0.75rem',
+                  }}
+                />
+              )}
+            </Box>
+            <Typography
+              variant="body2"
+              sx={{
+                color: '#64748b',
+                fontSize: '0.8rem',
+                lineHeight: 1.5,
+                ml: 3.5,
+              }}
+            >
+              {scenesNeedingImages === 0 ? (
+                <>All {numScenes} scenes already have custom images</>
+              ) : (
+                <>Creating <strong>{scenesNeedingImages} AI-generated images</strong> tailored to your scene content</>
+              )}
+            </Typography>
+            {scenesNeedingImages > 0 && (
+              <Typography
+                variant="caption"
+                sx={{
+                  color: '#94a3b8',
+                  fontSize: '0.7rem',
+                  display: 'block',
+                  ml: 3.5,
+                  mt: 0.5,
+                }}
+              >
+                Rate: ${costPerImage.toFixed(2)}/image • High-quality visuals using Ideogram V3 Turbo
+              </Typography>
+            )}
+          </Box>
+
+          {/* Scene Audio — NOTE(review): this section is hidden when nothing at all is missing, so its "All Ready" state only appears while images are still needed, whereas the image section is always shown; confirm this asymmetry is intended */}
+          {scenesNeedingAudio > 0 || scenesNeedingImages > 0 ? (
+            <>
+              <Divider sx={{ my: 0.5 }} />
+              <Box>
+                <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 0.5 }}>
+                  <AudioIcon sx={{ fontSize: 20, color: '#f59e0b' }} />
+                  <Typography
+                    variant="body2"
+                    sx={{
+                      color: '#1e293b',
+                      fontWeight: 600,
+                      fontSize: '0.875rem',
+                    }}
+                  >
+                    Audio Narration
+                  </Typography>
+                  <Chip 
+                    label={scenesNeedingAudio === 0 ? 'All Ready' : `${scenesNeedingAudio} needed`}
+                    size="small"
+                    sx={{ 
+                      ml: 'auto',
+                      bgcolor: scenesNeedingAudio === 0 ? '#10b981' : '#f59e0b',
+                      color: 'white',
+                      fontWeight: 600,
+                      fontSize: '0.75rem',
+                    }}
+                  />
+                  {scenesNeedingAudio > 0 && (
+                    <Chip 
+                      label={`$${audioCost.toFixed(2)}`}
+                      size="small"
+                      sx={{ 
+                        bgcolor: '#f59e0b',
+                        color: 'white',
+                        fontWeight: 600,
+                        fontSize: '0.75rem',
+                      }}
+                    />
+                  )}
+                </Box>
+                <Typography
+                  variant="body2"
+                  sx={{
+                    color: '#64748b',
+                    fontSize: '0.8rem',
+                    lineHeight: 1.5,
+                    ml: 3.5,
+                  }}
+                >
+                  {scenesNeedingAudio === 0 ? (
+                    <>All {numScenes} scenes already have professional voice narration</>
+                  ) : (
+                    <>Generating <strong>{scenesNeedingAudio} AI voice narrations</strong> from your scene scripts</>
+                  )}
+                </Typography>
+                {scenesNeedingAudio > 0 && (
+                  <Typography
+                    variant="caption"
+                    sx={{
+                      color: '#94a3b8',
+                      fontSize: '0.7rem',
+                      display: 'block',
+                      ml: 3.5,
+                      mt: 0.5,
+                    }}
+                  >
+                    Rate: ${costPerAudio.toFixed(2)}/audio • Natural-sounding voice using Minimax TTS
+                  </Typography>
+                )}
+              </Box>
+            </>
+          ) : null}
+        </Stack>
+
+        {/* Summary Box */}
+        <Box
+          sx={{
+            mt: 2,
+            p: 1.5,
+            bgcolor: '#f1f5f9',
+            borderRadius: 1.5,
+            border: '1px solid #cbd5e1',
+          }}
+        >
+          <Typography
+            variant="caption"
+            sx={{
+              color: '#475569',
+              fontSize: '0.75rem',
+              lineHeight: 1.6,
+              display: 'block',
+            }}
+          >
+            💡 <strong>Smart Generation:</strong> Generate only what you need! If you already have an image or audio for a scene, 
+            we won't charge you to regenerate it unless you explicitly click the regenerate button.
+          </Typography>
+        </Box>
+      </Box>
+
+      {/* Help Section */}
+      <Alert 
+        severity="info" 
+        icon={<InfoIcon />}
+        sx={{ 
+          bgcolor: '#eff6ff',
+          border: '1px solid #bfdbfe',
+          '& .MuiAlert-icon': {
+            color: '#3b82f6',
+          },
+        }}
+      >
+        <Typography variant="body2" sx={{ fontWeight: 600, fontSize: '0.8rem', mb: 0.5 }}>
+          How does this work?
+        </Typography>
+        <Typography variant="caption" sx={{ fontSize: '0.75rem', lineHeight: 1.5, display: 'block' }}>
+          Click "Generate Image" and "Generate Audio" buttons on each scene card. Images use AI to create custom 
+          visuals matching your content, and audio uses text-to-speech to narrate your script naturally. 
+          You only pay for what you generate!
+        </Typography>
+      </Alert>
+    </Box>
+  );
+});
+
+// Explicit display name for the memo-wrapped anonymous component.
+AssetGenerationCostCard.displayName = 'AssetGenerationCostCard';
+
--- a/frontend/src/components/YouTubeCreator/components/AudioSettingsModal.tsx
+++ b/frontend/src/components/YouTubeCreator/components/AudioSettingsModal.tsx
@@ -0,0 +1,512 @@
+import React, { useEffect, useState } from "react";
+import {
+  Dialog,
+  DialogTitle,
+  DialogContent,
+  DialogActions,
+  Stack,
+  Box,
+  Typography,
+  Slider,
+  Select,
+  MenuItem,
+  FormControl,
+  InputLabel,
+  FormControlLabel,
+  Checkbox,
+  Tooltip,
+  IconButton,
+  alpha,
+  TextField,
+} from "@mui/material";
+import { HelpOutline as HelpOutlineIcon, Close as CloseIcon, VolumeUp } from "@mui/icons-material";
+import { Button } from "@mui/material";
+
+export type YouTubeAudioGenerationSettings = { // Per-scene audio options edited in AudioSettingsModal.
+  voiceId: string; // Picked from VOICE_OPTIONS in the modal's voice dropdown.
+  speed: number; // Modal slider: 0.5 to 2.0 in 0.05 steps (1.0 is labelled normal).
+  volume: number; // Modal slider: 0.1 to 10.0 in 0.1 steps (1.0 is labelled normal).
+  pitch: number; // Modal slider: -12 to +12 in 0.5 steps (0 is labelled normal).
+  emotion: string; // Picked from EMOTION_OPTIONS in the modal.
+  englishNormalization: boolean; // Checkbox; UI copy says it improves numbers, dates and technical terms.
+  sampleRate?: number; // Hz; the modal displays 24000 when this is unset.
+  bitrate: number; // Bits per second (the modal renders bitrate / 1000 as kbps).
+  channel: "1" | "2"; // "1" = mono, "2" = stereo, per the modal's menu labels.
+  format: "mp3" | "wav" | "pcm" | "flac"; // Output audio format offered by the modal.
+  languageBoost?: string; // Picked from LANGUAGE_BOOST_OPTIONS; the modal displays "auto" when unset.
+  enableSyncMode: boolean; // Checkbox; UI copy: waits for audio completion before returning.
+};
+
+interface AudioSettingsModalProps { // Props accepted by AudioSettingsModal.
+  open: boolean; // Passed straight to the MUI Dialog's `open` prop.
+  onClose: () => void; // Wired to the Dialog's onClose, the header close icon, and the Cancel button.
+  onApplySettings: (settings: YouTubeAudioGenerationSettings) => void; // Receives the edited draft when the primary button is clicked.
+  initialSettings: YouTubeAudioGenerationSettings; // Seeds the draft; the draft is reset whenever this prop changes.
+  isGenerating?: boolean; // Defaults to false; while true, Cancel and the primary button are disabled.
+  sceneTitle?: string; // When provided, shown in the subtitle under the dialog title.
+}
+
+// Voice options from minimax/speech-02-hd
+const VOICE_OPTIONS = [ // Rendered in the voice dropdown; the chosen entry is stored as settings.voiceId.
+  "Wise_Woman",
+  "Friendly_Person",
+  "Inspirational_girl",
+  "Deep_Voice_Man",
+  "Calm_Woman",
+  "Casual_Guy",
+  "Lively_Girl",
+  "Patient_Man",
+  "Young_Knight",
+  "Determined_Man",
+  "Lovely_Girl",
+  "Decent_Boy",
+  "Imposing_Manner",
+  "Elegant_Man",
+  "Abbess",
+  "Sweet_Girl_2",
+  "Exuberant_Girl",
+];
+
+const EMOTION_OPTIONS = ["happy", "sad", "angry", "fearful", "disgusted", "surprised", "neutral"]; // Stored lower-case; the dropdown capitalises the first letter for display.
+
+const SAMPLE_RATE_OPTIONS = [8000, 16000, 22050, 24000, 32000, 44100]; // Hz, as labelled in the dropdown.
+const BITRATE_OPTIONS = [32000, 64000, 128000, 256000]; // Bits per second; the dropdown shows value / 1000 as kbps.
+const LANGUAGE_BOOST_OPTIONS = [ // Choices for settings.languageBoost; "auto" is what the modal shows when the setting is unset.
+  "auto",
+  "English",
+  "Chinese",
+  "Chinese,Yue",
+  "Arabic",
+  "Russian",
+  "Spanish",
+  "French",
+  "Portuguese",
+  "German",
+  "Turkish",
+  "Dutch",
+  "Ukrainian",
+  "Vietnamese",
+  "Indonesian",
+  "Japanese",
+  "Italian",
+  "Korean",
+  "Thai",
+  "Polish",
+  "Romanian",
+  "Greek",
+  "Czech",
+  "Finnish",
+  "Hindi",
+];
+/** Dialog for tuning one scene's audio settings: edits a local draft and passes it to onApplySettings on confirm. */
+export const AudioSettingsModal: React.FC<AudioSettingsModalProps> = ({
+  open,
+  onClose,
+  onApplySettings,
+  initialSettings,
+  isGenerating = false,
+  sceneTitle,
+}) => {
+  const [settings, setSettings] = useState<YouTubeAudioGenerationSettings>(initialSettings);
+  // Re-seed the local draft whenever the parent hands in a different initialSettings value.
+  useEffect(() => {
+    setSettings(initialSettings);
+  }, [initialSettings]);
+
+  const handleApply = () => {
+    onApplySettings(settings);
+  };
+  // Field handlers use functional updates so each change merges into the latest draft, not a stale render snapshot.
+  return (
+    <Dialog
+      open={open}
+      onClose={onClose}
+      maxWidth="md"
+      fullWidth
+      PaperProps={{
+        sx: {
+          background: "linear-gradient(135deg, #667eea 0%, #764ba2 100%)",
+          color: "white",
+        },
+      }}
+    >
+      <DialogTitle>
+        <Stack direction="row" justifyContent="space-between" alignItems="center">
+          <Box>
+            <Typography variant="h6" sx={{ fontWeight: 600, mb: 0.5 }}>
+              Audio Generation Settings
+            </Typography>
+            {sceneTitle && (
+              <Typography variant="body2" sx={{ opacity: 0.8 }}>
+                Configure voice settings for "{sceneTitle}"
+              </Typography>
+            )}
+          </Box>
+          <IconButton onClick={onClose} size="small" sx={{ color: "rgba(255,255,255,0.7)" }}>
+            <CloseIcon />
+          </IconButton>
+        </Stack>
+        <Typography variant="body2" sx={{ opacity: 0.7, mt: 1 }}>
+          Customize voice, tone, and quality for better audio results. Changes apply only to this scene.
+        </Typography>
+      </DialogTitle>
+
+      <DialogContent>
+        <Stack spacing={3} sx={{ mt: 1 }}>
+          {/* Voice Selection */}
+          <Box>
+            <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 1 }}>
+              <Typography variant="subtitle1" sx={{ fontWeight: 600 }}>
+                Voice
+              </Typography>
+              <Tooltip title="Choose from professional voice options. Each voice has unique characteristics for different content types." arrow>
+                <IconButton size="small" sx={{ color: "rgba(255,255,255,0.5)" }}>
+                  <HelpOutlineIcon fontSize="small" />
+                </IconButton>
+              </Tooltip>
+            </Stack>
+            <FormControl fullWidth>
+              <Select
+                value={settings.voiceId}
+                onChange={(e) => setSettings((prev) => ({ ...prev, voiceId: e.target.value }))}
+                sx={{
+                  backgroundColor: alpha("#ffffff", 0.1),
+                  color: "white",
+                  "& .MuiOutlinedInput-notchedOutline": { borderColor: "rgba(255,255,255,0.3)" },
+                  "&:hover .MuiOutlinedInput-notchedOutline": { borderColor: "rgba(255,255,255,0.4)" },
+                  "&.Mui-focused .MuiOutlinedInput-notchedOutline": { borderColor: "#ffffff" },
+                  "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                }}
+              >
+                {VOICE_OPTIONS.map((voice) => (
+                  <MenuItem key={voice} value={voice}>
+                    {voice.replace(/_/g, ' ')}
+                  </MenuItem>
+                ))}
+              </Select>
+            </FormControl>
+          </Box>
+
+          {/* Speed / Volume / Pitch */}
+          <Stack direction={{ xs: "column", sm: "row" }} spacing={2}>
+            <Box sx={{ flex: 1 }}>
+              <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 0.5 }}>
+                <Typography variant="subtitle2" sx={{ fontWeight: 600 }}>
+                  Speed ({settings.speed.toFixed(2)})
+                </Typography>
+                <Tooltip title="How fast the voice speaks. 1.0 = normal speed. Lower for narration, higher for conversational." arrow>
+                  <HelpOutlineIcon fontSize="small" sx={{ color: "rgba(255,255,255,0.5)" }} />
+                </Tooltip>
+              </Stack>
+              <Slider
+                value={settings.speed}
+                min={0.5}
+                max={2.0}
+                step={0.05}
+                onChange={(_, v) => setSettings((prev) => ({ ...prev, speed: v as number }))}
+                sx={{ color: "#4ade80" }}
+              />
+              <Typography variant="caption" sx={{ opacity: 0.7 }}>
+                0.5 = Slower (narrative) • 1.0 = Normal • 2.0 = Faster (energetic)
+              </Typography>
+            </Box>
+
+            <Box sx={{ flex: 1 }}>
+              <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 0.5 }}>
+                <Typography variant="subtitle2" sx={{ fontWeight: 600 }}>
+                  Volume ({settings.volume.toFixed(1)})
+                </Typography>
+                <Tooltip title="Loudness of the voice. 1.0 = normal volume." arrow>
+                  <HelpOutlineIcon fontSize="small" sx={{ color: "rgba(255,255,255,0.5)" }} />
+                </Tooltip>
+              </Stack>
+              <Slider
+                value={settings.volume}
+                min={0.1}
+                max={10.0}
+                step={0.1}
+                onChange={(_, v) => setSettings((prev) => ({ ...prev, volume: v as number }))}
+                sx={{ color: "#fbbf24" }}
+              />
+              <Typography variant="caption" sx={{ opacity: 0.7 }}>
+                0.1 = Very soft • 1.0 = Normal • 10.0 = Very loud
+              </Typography>
+            </Box>
+
+            <Box sx={{ flex: 1 }}>
+              <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 0.5 }}>
+                <Typography variant="subtitle2" sx={{ fontWeight: 600 }}>
+                  Pitch ({settings.pitch})
+                </Typography>
+                <Tooltip title="Tone of the voice. 0 = neutral. Negative = deeper, positive = higher pitched." arrow>
+                  <HelpOutlineIcon fontSize="small" sx={{ color: "rgba(255,255,255,0.5)" }} />
+                </Tooltip>
+              </Stack>
+              <Slider
+                value={settings.pitch}
+                min={-12}
+                max={12}
+                step={0.5}
+                onChange={(_, v) => setSettings((prev) => ({ ...prev, pitch: v as number }))}
+                sx={{ color: "#f87171" }}
+              />
+              <Typography variant="caption" sx={{ opacity: 0.7 }}>
+                -12 = Very deep • 0 = Normal • +12 = Very high
+              </Typography>
+            </Box>
+          </Stack>
+
+          {/* Emotion */}
+          <Box>
+            <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 1 }}>
+              <Typography variant="subtitle1" sx={{ fontWeight: 600 }}>
+                Emotion
+              </Typography>
+              <Tooltip title="Sets the vocal mood and emotional delivery style." arrow>
+                <IconButton size="small" sx={{ color: "rgba(255,255,255,0.5)" }}>
+                  <HelpOutlineIcon fontSize="small" />
+                </IconButton>
+              </Tooltip>
+            </Stack>
+            <FormControl fullWidth>
+              <Select
+                value={settings.emotion}
+                onChange={(e) => setSettings((prev) => ({ ...prev, emotion: e.target.value }))}
+                sx={{
+                  backgroundColor: alpha("#ffffff", 0.1),
+                  color: "white",
+                  "& .MuiOutlinedInput-notchedOutline": { borderColor: "rgba(255,255,255,0.3)" },
+                  "&:hover .MuiOutlinedInput-notchedOutline": { borderColor: "rgba(255,255,255,0.4)" },
+                  "&.Mui-focused .MuiOutlinedInput-notchedOutline": { borderColor: "#ffffff" },
+                  "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                }}
+              >
+                {EMOTION_OPTIONS.map((emotion) => (
+                  <MenuItem key={emotion} value={emotion}>
+                    {emotion.charAt(0).toUpperCase() + emotion.slice(1)}
+                  </MenuItem>
+                ))}
+              </Select>
+            </FormControl>
+            <Typography variant="caption" sx={{ opacity: 0.7, mt: 0.5, display: "block" }}>
+              Choose emotion that matches your content: happy for upbeat, neutral for professional, sad for serious topics.
+            </Typography>
+          </Box>
+
+          {/* Language & Normalization */}
+          <Stack direction={{ xs: "column", sm: "row" }} spacing={2}>
+            <Box sx={{ flex: 1 }}>
+              <FormControlLabel
+                control={
+                  <Checkbox
+                    checked={settings.englishNormalization}
+                    onChange={(e) => setSettings((prev) => ({ ...prev, englishNormalization: e.target.checked }))}
+                    sx={{ color: "rgba(255,255,255,0.7)" }}
+                  />
+                }
+                label={
+                  <Typography variant="body2" sx={{ color: "white" }}>
+                    English normalization
+                  </Typography>
+                }
+              />
+              <Typography variant="caption" sx={{ opacity: 0.7 }}>
+                Improves pronunciation of numbers, dates, and technical terms.
+              </Typography>
+            </Box>
+
+            <Box sx={{ flex: 1 }}>
+              <TextField
+                select
+                fullWidth
+                label="Language boost"
+                value={settings.languageBoost || "auto"}
+                onChange={(e) => setSettings((prev) => ({ ...prev, languageBoost: e.target.value }))}
+                InputLabelProps={{ sx: { color: "rgba(255,255,255,0.7)" } }}
+                sx={{
+                  "& .MuiOutlinedInput-root": {
+                    backgroundColor: alpha("#ffffff", 0.1),
+                    color: "white",
+                    "& fieldset": { borderColor: "rgba(255,255,255,0.3)" },
+                    "&:hover fieldset": { borderColor: "rgba(255,255,255,0.4)" },
+                    "&.Mui-focused fieldset": { borderColor: "#ffffff" },
+                    "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                  },
+                }}
+              >
+                {LANGUAGE_BOOST_OPTIONS.map((option) => (
+                  <MenuItem key={option} value={option}>
+                    {option}
+                  </MenuItem>
+                ))}
+              </TextField>
+              <Typography variant="caption" sx={{ opacity: 0.7, mt: 0.5, display: "block" }}>
+                Enhances pronunciation for specific languages.
+              </Typography>
+            </Box>
+          </Stack>
+
+          {/* Quality Settings */}
+          <Stack direction={{ xs: "column", sm: "row" }} spacing={2}>
+            <Box sx={{ flex: 1 }}>
+              <TextField
+                select
+                fullWidth
+                label="Sample rate"
+                value={settings.sampleRate || 24000}
+                onChange={(e) => setSettings((prev) => ({ ...prev, sampleRate: Number(e.target.value) }))}
+                InputLabelProps={{ sx: { color: "rgba(255,255,255,0.7)" } }}
+                sx={{
+                  "& .MuiOutlinedInput-root": {
+                    backgroundColor: alpha("#ffffff", 0.1),
+                    color: "white",
+                    "& fieldset": { borderColor: "rgba(255,255,255,0.3)" },
+                    "&:hover fieldset": { borderColor: "rgba(255,255,255,0.4)" },
+                    "&.Mui-focused fieldset": { borderColor: "#ffffff" },
+                    "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                  },
+                }}
+              >
+                {SAMPLE_RATE_OPTIONS.map((rate) => (
+                  <MenuItem key={rate} value={rate}>
+                    {rate} Hz
+                  </MenuItem>
+                ))}
+              </TextField>
+              <Typography variant="caption" sx={{ opacity: 0.7, mt: 0.5, display: "block" }}>
+                Higher = better quality, larger files. 24kHz recommended for voice.
+              </Typography>
+            </Box>
+
+            <Box sx={{ flex: 1 }}>
+              <TextField
+                select
+                fullWidth
+                label="Bitrate"
+                value={settings.bitrate}
+                onChange={(e) => setSettings((prev) => ({ ...prev, bitrate: Number(e.target.value) }))}
+                InputLabelProps={{ sx: { color: "rgba(255,255,255,0.7)" } }}
+                sx={{
+                  "& .MuiOutlinedInput-root": {
+                    backgroundColor: alpha("#ffffff", 0.1),
+                    color: "white",
+                    "& fieldset": { borderColor: "rgba(255,255,255,0.3)" },
+                    "&:hover fieldset": { borderColor: "rgba(255,255,255,0.4)" },
+                    "&.Mui-focused fieldset": { borderColor: "#ffffff" },
+                    "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                  },
+                }}
+              >
+                {BITRATE_OPTIONS.map((bitrate) => (
+                  <MenuItem key={bitrate} value={bitrate}>
+                    {bitrate / 1000} kbps
+                  </MenuItem>
+                ))}
+              </TextField>
+              <Typography variant="caption" sx={{ opacity: 0.7, mt: 0.5, display: "block" }}>
+                Higher = clearer audio, larger files. 128kbps recommended.
+              </Typography>
+            </Box>
+          </Stack>
+
+          {/* Format & Channel */}
+          <Stack direction={{ xs: "column", sm: "row" }} spacing={2}>
+            <Box sx={{ flex: 1 }}>
+              <TextField
+                select
+                fullWidth
+                label="Channel"
+                value={settings.channel}
+                onChange={(e) => setSettings((prev) => ({ ...prev, channel: e.target.value as "1" | "2" }))}
+                InputLabelProps={{ sx: { color: "rgba(255,255,255,0.7)" } }}
+                sx={{
+                  "& .MuiOutlinedInput-root": {
+                    backgroundColor: alpha("#ffffff", 0.1),
+                    color: "white",
+                    "& fieldset": { borderColor: "rgba(255,255,255,0.3)" },
+                    "&:hover fieldset": { borderColor: "rgba(255,255,255,0.4)" },
+                    "&.Mui-focused fieldset": { borderColor: "#ffffff" },
+                    "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                  },
+                }}
+              >
+                <MenuItem value="1">Mono (smaller files, voice-focused)</MenuItem>
+                <MenuItem value="2">Stereo (better spatial audio)</MenuItem>
+              </TextField>
+            </Box>
+
+            <Box sx={{ flex: 1 }}>
+              <TextField
+                select
+                fullWidth
+                label="Format"
+                value={settings.format}
+                onChange={(e) => setSettings((prev) => ({ ...prev, format: e.target.value as "mp3" | "wav" | "pcm" | "flac" }))}
+                InputLabelProps={{ sx: { color: "rgba(255,255,255,0.7)" } }}
+                sx={{
+                  "& .MuiOutlinedInput-root": {
+                    backgroundColor: alpha("#ffffff", 0.1),
+                    color: "white",
+                    "& fieldset": { borderColor: "rgba(255,255,255,0.3)" },
+                    "&:hover fieldset": { borderColor: "rgba(255,255,255,0.4)" },
+                    "&.Mui-focused fieldset": { borderColor: "#ffffff" },
+                    "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                  },
+                }}
+              >
+                <MenuItem value="mp3">MP3 (compressed, universal)</MenuItem>
+                <MenuItem value="wav">WAV (uncompressed, high quality)</MenuItem>
+                <MenuItem value="pcm">PCM (raw audio data)</MenuItem>
+                <MenuItem value="flac">FLAC (lossless compression)</MenuItem>
+              </TextField>
+            </Box>
+          </Stack>
+
+          {/* Sync Mode */}
+          <Box>
+            <FormControlLabel
+              control={
+                <Checkbox
+                  checked={settings.enableSyncMode}
+                  onChange={(e) => setSettings((prev) => ({ ...prev, enableSyncMode: e.target.checked }))}
+                  sx={{ color: "rgba(255,255,255,0.7)" }}
+                />
+              }
+              label={
+                <Typography variant="body2" sx={{ color: "white" }}>
+                  Enable sync mode (faster, recommended)
+                </Typography>
+              }
+            />
+            <Typography variant="caption" sx={{ opacity: 0.7 }}>
+              Waits for audio completion before returning. Recommended for most use cases.
+            </Typography>
+          </Box>
+        </Stack>
+      </DialogContent>
+
+      <DialogActions sx={{ p: 3, pt: 2 }}>
+        <Button
+          onClick={onClose}
+          disabled={isGenerating}
+          sx={{ color: "rgba(255,255,255,0.7)" }}
+        >
+          Cancel
+        </Button>
+        <Button
+          onClick={handleApply}
+          variant="contained"
+          disabled={isGenerating}
+          startIcon={isGenerating ? undefined : <VolumeUp />}
+          sx={{
+            backgroundColor: "#4ade80",
+            "&:hover": { backgroundColor: "#22c55e" },
+            "&:disabled": { backgroundColor: "rgba(255,255,255,0.2)" },
+          }}
+        >
+          {isGenerating ? "Generating..." : "Apply Settings & Generate"}
+        </Button>
+      </DialogActions>
+    </Dialog>
+  );
+};
--- a/frontend/src/components/YouTubeCreator/components/CombinedSceneOverview.tsx
+++ b/frontend/src/components/YouTubeCreator/components/CombinedSceneOverview.tsx
@@ -18,7 +18,7 @@ import {
  IconButton,
  Alert,
 } from '@mui/material';
-import { HelpOutline, Timeline, BarChart, AccessTime, Movie, Info } from '@mui/icons-material';
+import { HelpOutline, Timeline, BarChart, AccessTime, Movie, Info, Image as ImageIcon, VolumeUp, CheckCircle } from '@mui/icons-material';
 import { Scene } from '../../../services/youtubeApi';
 import { getSceneIcon, getSceneColor, getSceneTypeLabel, formatDuration } from '../utils/sceneHelpers';

@@ -40,6 +40,12 @@ export const CombinedSceneOverview: React.FC<CombinedSceneOverviewProps> = React
      return acc;
    }, {} as Record<string, number>);

+    // Asset readiness stats
+    const scenesWithImages = enabledScenes.filter(s => s.imageUrl).length;
+    const scenesWithAudio = enabledScenes.filter(s => s.audioUrl).length;
+    const scenesWithBoth = enabledScenes.filter(s => s.imageUrl && s.audioUrl).length;
+    const allReady = enabledScenes.length > 0 && scenesWithBoth === enabledScenes.length;
+
    return {
      totalScenes: scenes.length,
      enabledScenes: enabledScenes.length,
@@ -47,6 +53,10 @@ export const CombinedSceneOverview: React.FC<CombinedSceneOverviewProps> = React
      averageDuration,
      sceneBreakdown,
      enabledScenesList: enabledScenes,
+      scenesWithImages,
+      scenesWithAudio,
+      scenesWithBoth,
+      allReady,
    };
  }, [scenes]);

@@ -191,6 +201,79 @@ export const CombinedSceneOverview: React.FC<CombinedSceneOverviewProps> = React

                <Divider sx={{ my: 0.5 }} />

+                {/* Asset Readiness */}
+                <Box>
+                  <Typography
+                    variant="caption"
+                    sx={{
+                      fontWeight: 600,
+                      color: '#6b7280',
+                      fontSize: '0.75rem',
+                      textTransform: 'uppercase',
+                      letterSpacing: '0.05em',
+                      display: 'block',
+                      mb: 1,
+                    }}
+                  >
+                    Asset Status
+                  </Typography>
+                  <Stack direction="row" spacing={1} flexWrap="wrap" useFlexGap>
+                    <Tooltip
+                      title="Number of scenes with AI-generated images ready"
+                      arrow
+                    >
+                      <Chip
+                        icon={<ImageIcon sx={{ fontSize: 14 }} />}
+                        label={`${stats.scenesWithImages}/${stats.enabledScenes} Images`}
+                        size="small"
+                        sx={{
+                          fontWeight: 500,
+                          fontSize: '0.75rem',
+                          bgcolor: stats.scenesWithImages === stats.enabledScenes ? '#d1fae5' : '#fef3c7',
+                          color: stats.scenesWithImages === stats.enabledScenes ? '#065f46' : '#92400e',
+                          border: `1px solid ${stats.scenesWithImages === stats.enabledScenes ? '#10b981' : '#f59e0b'}`,
+                        }}
+                      />
+                    </Tooltip>
+                    <Tooltip
+                      title="Number of scenes with audio narration ready"
+                      arrow
+                    >
+                      <Chip
+                        icon={<VolumeUp sx={{ fontSize: 14 }} />}
+                        label={`${stats.scenesWithAudio}/${stats.enabledScenes} Audio`}
+                        size="small"
+                        sx={{
+                          fontWeight: 500,
+                          fontSize: '0.75rem',
+                          bgcolor: stats.scenesWithAudio === stats.enabledScenes ? '#d1fae5' : '#fef3c7',
+                          color: stats.scenesWithAudio === stats.enabledScenes ? '#065f46' : '#92400e',
+                          border: `1px solid ${stats.scenesWithAudio === stats.enabledScenes ? '#10b981' : '#f59e0b'}`,
+                        }}
+                      />
+                    </Tooltip>
+                    {stats.allReady && (
+                      <Tooltip
+                        title="All scenes are ready for video generation!"
+                        arrow
+                      >
+                        <Chip
+                          icon={<CheckCircle sx={{ fontSize: 14 }} />}
+                          label="All Ready"
+                          size="small"
+                          color="success"
+                          sx={{
+                            fontWeight: 600,
+                            fontSize: '0.75rem',
+                          }}
+                        />
+                      </Tooltip>
+                    )}
+                  </Stack>
+                </Box>
+
+                <Divider sx={{ my: 0.5 }} />
+
                {/* Scene Type Breakdown */}
                <Box>
                  <Typography
--- a/frontend/src/components/YouTubeCreator/components/CostEstimateCard.tsx
+++ b/frontend/src/components/YouTubeCreator/components/CostEstimateCard.tsx
@@ -1,43 +1,91 @@
 /**
 * Cost Estimate Card Component
 * 
- * Displays professional cost estimate with breakdown and per-scene costs.
+ * Displays user-friendly cost estimate with clear breakdown and explanations.
 */

-import React from 'react';
+import React, { useMemo } from 'react';
 import {
  Box,
  Typography,
  Stack,
  CircularProgress,
  Alert,
+  Chip,
+  Divider,
 } from '@mui/material';
-import { CostEstimate } from '../../../services/youtubeApi';
+import {
+  MonetizationOn as MoneyIcon,
+  VideoLibrary as VideoIcon,
+  Image as ImageIcon,
+  Info as InfoIcon,
+} from '@mui/icons-material';
+import { CostEstimate, Scene } from '../../../services/youtubeApi';

 interface CostEstimateCardProps {
  costEstimate: CostEstimate | null;
  loadingCostEstimate: boolean;
+  scenes?: Scene[];
 }

 export const CostEstimateCard: React.FC<CostEstimateCardProps> = React.memo(({
  costEstimate,
  loadingCostEstimate,
+  scenes = [],
 }) => {
+  // Calculate total image cost if available
+  const totalImageCost = useMemo(() => {
+    if (!costEstimate) return 0;
+    return costEstimate.total_image_cost || 
+           (costEstimate.image_cost_per_scene ? costEstimate.num_scenes * costEstimate.image_cost_per_scene : 0);
+  }, [costEstimate]);
+
+  // Calculate video rendering cost
+  const videoRenderCost = useMemo(() => {
+    if (!costEstimate) return 0;
+    return costEstimate.total_cost - totalImageCost;
+  }, [costEstimate, totalImageCost]);
+
  if (loadingCostEstimate) {
    return (
      <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mt: 2 }}>
        <CircularProgress size={16} />
        <Typography variant="body2" color="text.secondary">
-          Calculating cost estimate...
+          Calculating your video cost...
        </Typography>
      </Box>
    );
  }

  if (!costEstimate) {
+    // Check which scenes are missing assets
+    const enabledScenes = scenes.filter(s => s.enabled !== false);
+    const scenesMissingImage = enabledScenes.filter(s => !s.imageUrl);
+    const scenesMissingAudio = enabledScenes.filter(s => !s.audioUrl);
+    const scenesMissingBoth = enabledScenes.filter(s => !s.imageUrl && !s.audioUrl);
+    
+    let errorMessage = 'Please ensure all enabled scenes have images and audio.';
+    if (scenesMissingBoth.length > 0 || scenesMissingImage.length > 0 || scenesMissingAudio.length > 0) {
+      const missingDetails: string[] = [];
+      if (scenesMissingImage.length > 0) {
+        missingDetails.push(`${scenesMissingImage.length} scene${scenesMissingImage.length !== 1 ? 's' : ''} missing image${scenesMissingImage.length !== 1 ? 's' : ''}`);
+      }
+      if (scenesMissingAudio.length > 0) {
+        missingDetails.push(`${scenesMissingAudio.length} scene${scenesMissingAudio.length !== 1 ? 's' : ''} missing audio`);
+      }
+      if (missingDetails.length > 0) {
+        errorMessage = `Unable to calculate cost: ${missingDetails.join(', ')}. Go back to "Generate Assets" step to create missing assets.`;
+      }
+    }
+    
    return (
      <Alert severity="warning" sx={{ mt: 2 }}>
-        Unable to calculate cost estimate. Please check your scenes and try again.
+        <Typography variant="body2" sx={{ fontWeight: 500 }}>
+          Unable to calculate cost estimate
+        </Typography>
+        <Typography variant="caption" sx={{ display: 'block', mt: 0.5 }}>
+          {errorMessage}
+        </Typography>
      </Alert>
    );
  }
@@ -47,33 +95,56 @@ export const CostEstimateCard: React.FC<CostEstimateCardProps> = React.memo(({
      sx={{
        mt: 3,
        p: 3,
-        bgcolor: '#ffffff',
-        borderRadius: 2,
-        border: '2px solid #e5e7eb',
-        boxShadow: '0 1px 3px 0 rgba(0, 0, 0, 0.1)',
+        bgcolor: 'linear-gradient(135deg, #667eea 0%, #764ba2 100%)',
+        background: 'linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%)',
+        borderRadius: 3,
+        border: '2px solid #667eea',
+        boxShadow: '0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06)',
      }}
    >
-      <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 2 }}>
-        <Typography
-          variant="h6"
-          sx={{
-            fontWeight: 700,
-            fontSize: '1rem',
-            color: '#111827',
-            letterSpacing: '-0.01em',
-          }}
-        >
-          Estimated Cost
-        </Typography>
+      {/* Header */}
+      <Box sx={{ display: 'flex', alignItems: 'center', gap: 1.5, mb: 3 }}>
+        <MoneyIcon sx={{ color: '#667eea', fontSize: 28 }} />
+        <Box>
+          <Typography
+            variant="h6"
+            sx={{
+              fontWeight: 700,
+              fontSize: '1.1rem',
+              color: '#1e293b',
+              letterSpacing: '-0.01em',
+            }}
+          >
+            💰 Total Cost Estimate
+          </Typography>
+          <Typography
+            variant="caption"
+            sx={{
+              color: '#64748b',
+              fontSize: '0.75rem',
+            }}
+          >
+            What you'll pay to create this video
+          </Typography>
+        </Box>
      </Box>
      
-      <Box sx={{ mb: 2.5 }}>
+      {/* Total Cost Display */}
+      <Box 
+        sx={{ 
+          mb: 3, 
+          p: 2.5,
+          bgcolor: 'white',
+          borderRadius: 2,
+          boxShadow: '0 2px 4px rgba(0, 0, 0, 0.08)',
+        }}
+      >
        <Typography
-          variant="h4"
+          variant="h3"
          sx={{
-            fontWeight: 700,
-            fontSize: '2rem',
-            color: '#111827',
+            fontWeight: 800,
+            fontSize: '2.5rem',
+            color: '#667eea',
            lineHeight: 1.2,
            mb: 0.5,
          }}
@@ -83,131 +154,339 @@ export const CostEstimateCard: React.FC<CostEstimateCardProps> = React.memo(({
        <Typography
          variant="body2"
          sx={{
-            color: '#6b7280',
+            color: '#64748b',
            fontSize: '0.875rem',
            fontWeight: 500,
          }}
        >
-          Range: ${costEstimate.estimated_cost_range.min.toFixed(2)} - ${costEstimate.estimated_cost_range.max.toFixed(2)}
+          Estimated range: ${costEstimate.estimated_cost_range.min.toFixed(2)} - ${costEstimate.estimated_cost_range.max.toFixed(2)}
+        </Typography>
+        <Typography
+          variant="caption"
+          sx={{
+            color: '#94a3b8',
+            fontSize: '0.75rem',
+            display: 'block',
+            mt: 0.5,
+          }}
+        >
+          Final cost may vary by ±10% based on actual processing
        </Typography>
      </Box>

+      {/* What's Included Section */}
      <Box
        sx={{
-          p: 2,
-          bgcolor: '#f9fafb',
-          borderRadius: 1.5,
-          border: '1px solid #e5e7eb',
-          mb: 2,
+          p: 2.5,
+          bgcolor: 'white',
+          borderRadius: 2,
+          mb: 2.5,
+          boxShadow: '0 1px 3px rgba(0, 0, 0, 0.08)',
        }}
      >
        <Typography
-          variant="body2"
+          variant="subtitle2"
          sx={{
-            color: '#374151',
-            fontSize: '0.875rem',
-            lineHeight: 1.6,
-            mb: 0.5,
+            color: '#1e293b',
+            fontWeight: 700,
+            mb: 2,
+            fontSize: '0.95rem',
+            display: 'flex',
+            alignItems: 'center',
+            gap: 1,
          }}
        >
-          <strong>{costEstimate.num_scenes} scenes</strong> × <strong>${costEstimate.price_per_second.toFixed(2)}/second</strong>
+          <InfoIcon sx={{ fontSize: 18, color: '#667eea' }} />
+          What's Included in This Price
        </Typography>
-        <Typography
-          variant="body2"
+
+        <Stack spacing={2}>
+          {/* Video Rendering */}
+          <Box>
+            <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 0.5 }}>
+              <VideoIcon sx={{ fontSize: 20, color: '#667eea' }} />
+              <Typography
+                variant="body2"
+                sx={{
+                  color: '#1e293b',
+                  fontWeight: 600,
+                  fontSize: '0.875rem',
+                }}
+              >
+                AI Video Generation
+              </Typography>
+              <Chip 
+                label={`$${videoRenderCost.toFixed(2)}`}
+                size="small"
+                sx={{ 
+                  ml: 'auto',
+                  bgcolor: '#667eea',
+                  color: 'white',
+                  fontWeight: 600,
+                  fontSize: '0.75rem',
+                }}
+              />
+            </Box>
+            <Typography
+              variant="body2"
+              sx={{
+                color: '#64748b',
+                fontSize: '0.8rem',
+                lineHeight: 1.5,
+                ml: 3.5,
+              }}
+            >
+              Creating <strong>{costEstimate.num_scenes} video scenes</strong> ({Math.round(costEstimate.total_duration_seconds)} seconds total) at <strong>{costEstimate.resolution}</strong> quality
+            </Typography>
+            <Typography
+              variant="caption"
+              sx={{
+                color: '#94a3b8',
+                fontSize: '0.7rem',
+                display: 'block',
+                ml: 3.5,
+                mt: 0.5,
+              }}
+            >
+              Rate: ${costEstimate.price_per_second.toFixed(2)}/second • Using advanced AI to transform your narration into engaging video scenes
+            </Typography>
+          </Box>
+
+          {/* Image Generation (if applicable) */}
+          {totalImageCost > 0 && (
+            <>
+              <Divider sx={{ my: 0.5 }} />
+              <Box>
+                <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 0.5 }}>
+                  <ImageIcon sx={{ fontSize: 20, color: '#10b981' }} />
+                  <Typography
+                    variant="body2"
+                    sx={{
+                      color: '#1e293b',
+                      fontWeight: 600,
+                      fontSize: '0.875rem',
+                    }}
+                  >
+                    Scene Images
+                  </Typography>
+                  <Chip 
+                    label={`$${totalImageCost.toFixed(2)}`}
+                    size="small"
+                    sx={{ 
+                      ml: 'auto',
+                      bgcolor: '#10b981',
+                      color: 'white',
+                      fontWeight: 600,
+                      fontSize: '0.75rem',
+                    }}
+                  />
+                </Box>
+                <Typography
+                  variant="body2"
+                  sx={{
+                    color: '#64748b',
+                    fontSize: '0.8rem',
+                    lineHeight: 1.5,
+                    ml: 3.5,
+                  }}
+                >
+                  Generating <strong>{costEstimate.num_scenes} custom images</strong> for your video scenes
+                  {costEstimate.image_model && ` using ${costEstimate.image_model}`}
+                </Typography>
+                {costEstimate.image_cost_per_scene && (
+                  <Typography
+                    variant="caption"
+                    sx={{
+                      color: '#94a3b8',
+                      fontSize: '0.7rem',
+                      display: 'block',
+                      ml: 3.5,
+                      mt: 0.5,
+                    }}
+                  >
+                    Rate: ${costEstimate.image_cost_per_scene.toFixed(2)}/image • High-quality AI-generated visuals tailored to your content
+                  </Typography>
+                )}
+              </Box>
+            </>
+          )}
+        </Stack>
+
+        {/* Summary Box */}
+        <Box
          sx={{
-            color: '#374151',
-            fontSize: '0.875rem',
-            lineHeight: 1.6,
-            mb: 0.5,
+            mt: 2,
+            p: 1.5,
+            bgcolor: '#f1f5f9',
+            borderRadius: 1.5,
+            border: '1px solid #cbd5e1',
          }}
        >
-          Total duration: <strong>~{Math.round(costEstimate.total_duration_seconds)} seconds</strong>
-        </Typography>
-        <Typography
-          variant="body2"
-          sx={{
-            color: '#374151',
-            fontSize: '0.875rem',
-            lineHeight: 1.6,
-          }}
-        >
-          Price per second: <strong>${costEstimate.price_per_second.toFixed(2)}</strong> ({costEstimate.resolution})
-        </Typography>
+          <Typography
+            variant="caption"
+            sx={{
+              color: '#475569',
+              fontSize: '0.75rem',
+              lineHeight: 1.6,
+              display: 'block',
+            }}
+          >
+            💡 <strong>Good to know:</strong> You only pay for the AI processing to create your video. 
+            There are no hidden fees, subscription requirements, or storage charges. 
+            Once created, your video is yours to download and use forever!
+          </Typography>
+        </Box>
      </Box>

+      {/* Per Scene Breakdown (rendered only when scene costs exist; lists the first 5 scenes) */}
      {costEstimate.scene_costs.length > 0 && (
        <Box
          sx={{
-            pt: 2,
-            borderTop: '2px solid #e5e7eb',
+            p: 2.5,
+            bgcolor: 'white',
+            borderRadius: 2,
+            boxShadow: '0 1px 3px rgba(0, 0, 0, 0.08)',
          }}
        >
          <Typography
            variant="subtitle2"
            sx={{
-              fontWeight: 600,
+              fontWeight: 700,
              fontSize: '0.875rem',
-              color: '#111827',
+              color: '#1e293b',
              mb: 1.5,
-              textTransform: 'uppercase',
-              letterSpacing: '0.05em',
+              display: 'flex',
+              alignItems: 'center',
+              gap: 1,
            }}
          >
-            Per Scene Breakdown
+            📊 Cost Per Scene
+            <Typography
+              component="span"
+              variant="caption"
+              sx={{
+                ml: 'auto',
+                color: '#64748b',
+                fontWeight: 500,
+              }}
+            >
+              {costEstimate.scene_costs.length} scenes
+            </Typography>
          </Typography>
-          <Stack spacing={0.75}>
-            {costEstimate.scene_costs.slice(0, 5).map((sceneCost) => (
+          
+          <Stack spacing={1}>
+            {costEstimate.scene_costs.slice(0, 5).map((sceneCost, idx) => (
              <Box
                key={sceneCost.scene_number}
                sx={{
                  display: 'flex',
                  justifyContent: 'space-between',
                  alignItems: 'center',
-                  py: 0.75,
+                  py: 1,
                  px: 1.5,
-                  bgcolor: '#ffffff',
+                  bgcolor: idx % 2 === 0 ? '#f8fafc' : '#ffffff',
                  borderRadius: 1,
-                  border: '1px solid #e5e7eb',
+                  border: '1px solid #e2e8f0',
+                  transition: 'all 0.2s',
+                  '&:hover': {
+                    bgcolor: '#f1f5f9',
+                    borderColor: '#cbd5e1',
+                  },
+                }}
+              >
+                <Box>
+                  <Typography
+                    variant="body2"
+                    sx={{
+                      color: '#1e293b',
+                      fontSize: '0.875rem',
+                      fontWeight: 600,
+                    }}
+                  >
+                    Scene {sceneCost.scene_number}
+                  </Typography>
+                  <Typography
+                    variant="caption"
+                    sx={{
+                      color: '#64748b',
+                      fontSize: '0.7rem',
+                    }}
+                  >
+                    {sceneCost.actual_duration}s video
+                    {sceneCost.duration_estimate !== sceneCost.actual_duration && 
+                      ` (optimized from ${sceneCost.duration_estimate}s)`}
+                  </Typography>
+                </Box>
+                <Chip
+                  label={`$${sceneCost.cost.toFixed(2)}`}
+                  size="small"
+                  sx={{
+                    bgcolor: '#667eea',
+                    color: 'white',
+                    fontWeight: 600,
+                    fontSize: '0.75rem',
+                  }}
+                />
+              </Box>
+            ))}
+            
+            {costEstimate.scene_costs.length > 5 && (
+              <Box
+                sx={{
+                  py: 1,
+                  textAlign: 'center',
+                  bgcolor: '#f8fafc',
+                  borderRadius: 1,
+                  border: '1px dashed #cbd5e1',
                }}
              >
                <Typography
                  variant="body2"
                  sx={{
-                    color: '#374151',
-                    fontSize: '0.875rem',
+                    color: '#64748b',
+                    fontSize: '0.8rem',
                    fontWeight: 500,
                  }}
                >
-                  Scene {sceneCost.scene_number}: {sceneCost.actual_duration}s
+                  + {costEstimate.scene_costs.length - 5} more scenes
                </Typography>
                <Typography
-                  variant="body2"
+                  variant="caption"
                  sx={{
-                    color: '#111827',
-                    fontSize: '0.875rem',
-                    fontWeight: 600,
+                    color: '#94a3b8',
+                    fontSize: '0.7rem',
                  }}
                >
-                  ${sceneCost.cost.toFixed(2)}
+                  (scroll down after rendering to see all scenes)
                </Typography>
              </Box>
-            ))}
-            {costEstimate.scene_costs.length > 5 && (
-              <Typography
-                variant="body2"
-                sx={{
-                  color: '#6b7280',
-                  fontSize: '0.875rem',
-                  textAlign: 'center',
-                  py: 0.5,
-                }}
-              >
-                ... and {costEstimate.scene_costs.length - 5} more scenes
-              </Typography>
            )}
          </Stack>
        </Box>
      )}
+
+      {/* Help Section */}
+      <Alert 
+        severity="info" 
+        icon={<InfoIcon />}
+        sx={{ 
+          mt: 2.5,
+          bgcolor: '#eff6ff',
+          border: '1px solid #bfdbfe',
+          '& .MuiAlert-icon': {
+            color: '#3b82f6',
+          },
+        }}
+      >
+        <Typography variant="body2" sx={{ fontWeight: 600, fontSize: '0.8rem', mb: 0.5 }}>
+          Why does video creation cost money?
+        </Typography>
+        <Typography variant="caption" sx={{ fontSize: '0.75rem', lineHeight: 1.5, display: 'block' }}>
+          Creating videos with AI requires powerful computing resources. Each second of video is generated by 
+          advanced AI models that analyze your script, create visuals, and synchronize everything perfectly. 
+          The cost covers the actual AI processing time needed to bring your content to life.
+        </Typography>
+      </Alert>
    </Box>
  );
 });
--- a/frontend/src/components/YouTubeCreator/components/RenderStep.tsx
+++ b/frontend/src/components/YouTubeCreator/components/RenderStep.tsx
@@ -5,7 +5,7 @@
 * Orchestrates scene overview, settings, cost estimation, and render status.
 */

-import React from 'react';
+import React, { useMemo, useState } from 'react';
 import {
  Paper,
  Typography,
@@ -14,16 +14,22 @@ import {
  Box,
  Alert,
  CircularProgress,
+  Chip,
+  IconButton,
+  Tooltip,
 } from '@mui/material';
-import { PlayArrow } from '@mui/icons-material';
+import { PlayArrow, CheckCircle, Warning, ArrowBack, Visibility, Image as ImageIcon, VolumeUp } from '@mui/icons-material';
 import { motion } from 'framer-motion';
 import { TaskStatus, CostEstimate, VideoPlan, Scene } from '../../../services/youtubeApi';
 import { YT_BORDER, type Resolution } from '../constants';
-import { SceneCard } from './SceneCard';
 import { CombinedSceneOverview } from './CombinedSceneOverview';
 import { CostEstimateCard } from './CostEstimateCard';
 import { RenderSettings } from './RenderSettings';
 import { RenderStatusDisplay } from './RenderStatusDisplay';
+import { ScenePreviewModal } from './ScenePreviewModal';
+import { useYouTubeRenderQueue } from '../hooks/useYouTubeRenderQueue';
+import Snackbar from '@mui/material/Snackbar';
+import MuiAlert, { AlertColor } from '@mui/material/Alert';

 interface RenderStepProps {
  renderTaskId: string | null;
@@ -37,19 +43,13 @@ interface RenderStepProps {
  loading: boolean;
  scenes: Scene[];
  videoPlan: VideoPlan | null;
-  editingSceneId: number | null;
-  editedScene: Partial<Scene> | null;
  onResolutionChange: (resolution: Resolution) => void;
  onCombineScenesChange: (combine: boolean) => void;
  onStartRender: () => void;
  onBack: () => void;
  onReset: () => void;
  onRetryFailedScenes: (failedScenes: any[]) => void;
-  onEditScene: (scene: Scene) => void;
-  onSaveScene: () => void;
-  onCancelEdit: () => void;
-  onEditChange: (updates: Partial<Scene>) => void;
-  onToggleScene: (sceneNumber: number) => void;
+  onScenesUpdate: (updatedScenes: Scene[]) => void;
  getVideoUrl: () => string | null;
 }

@@ -64,21 +64,53 @@ export const RenderStep: React.FC<RenderStepProps> = React.memo(({
  loadingCostEstimate,
  loading,
  scenes,
-  editingSceneId,
-  editedScene,
+  videoPlan,
  onResolutionChange,
  onCombineScenesChange,
  onStartRender,
  onBack,
  onReset,
  onRetryFailedScenes,
-  onEditScene,
-  onSaveScene,
-  onCancelEdit,
-  onEditChange,
-  onToggleScene,
  getVideoUrl,
+  onScenesUpdate,
 }) => {
+  // Snackbar state: visibility, message text, and severity of the current notification.
+  const [snackbar, setSnackbar] = useState<{ open: boolean; message: string; severity: AlertColor }>({
+    open: false,
+    message: '',
+    severity: 'info',
+  });
+
+  // Preview modal state: whether the modal is open and which scene it targets.
+  const [previewModalOpen, setPreviewModalOpen] = useState(false);
+  const [previewScene, setPreviewScene] = useState<Scene | null>(null);
+
+  /** Sets the snackbar state to open with `message`; `severity` defaults to 'info'. */
+  function showSnackbar(message: string, severity: AlertColor = 'info'): void {
+    const nextSnackbar = { open: true, message, severity };
+    setSnackbar(nextSnackbar);
+  }
+
+  /** Stores `scene` as the preview target, then flags the preview modal as open. */
+  function handlePreviewScene(scene: Scene): void {
+    setPreviewScene(scene);
+    setPreviewModalOpen(true);
+  }
+
+  const {
+    sceneStatuses,
+    finalVideoUrl,
+    combining,
+    combiningProgress,
+    combiningMessage,
+    runSceneVideo,
+    combineVideos,
+  } = useYouTubeRenderQueue({
+    scenes,
+    videoPlan,
+    resolution,
+    onScenesUpdate,
+    onError: (msg) => showSnackbar(msg, 'error'),
+    onSuccess: (msg) => showSnackbar(msg, 'success'),
+    onInfo: (msg) => showSnackbar(msg, 'info'),
+  });
+
+  const canStartRender = enabledScenesCount > 0 && !loading;
  return (
    <motion.div
      initial={{ opacity: 0, y: 20 }}
@@ -91,42 +123,259 @@ export const RenderStep: React.FC<RenderStepProps> = React.memo(({
          border: `1px solid ${YT_BORDER}`,
        }}
      >
-        <Typography variant="h5" sx={{ mb: 3, fontWeight: 600 }}>
-          3️⃣ Render Video
-        </Typography>
+                    <Typography variant="h5" sx={{ mb: 3, fontWeight: 600 }}>
+                      4️⃣ Render Final Video
+                    </Typography>

-        {!renderTaskId ? (
-          <Stack spacing={3}>
-            <Alert severity="info">
-              Review your scenes, configure render settings, and start generating your video. This may take several minutes.
-            </Alert>
+                    {!renderTaskId ? (
+                      <Stack spacing={3}>
+                        <Alert severity="info" icon={<CheckCircle />}>
+                          <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
+                            Ready to create your video!
+                          </Typography>
+                          <Typography variant="caption" sx={{ fontSize: '0.75rem' }}>
+                            All scenes have their images and audio. Configure your render settings below and start the video generation process.
+                          </Typography>
+                        </Alert>

            {/* Combined Scene Statistics & Timeline */}
            {scenes.length > 0 && (
              <CombinedSceneOverview scenes={scenes} />
            )}

-            {/* Scene Details - Full descriptions */}
+            {/* Scene-wise Video Generation */}
            {scenes.length > 0 && (
              <Box sx={{ mb: 3 }}>
-                <Typography variant="h6" sx={{ mb: 2, fontWeight: 600, color: '#111827' }}>
-                  Scene Details
+                {/* Section header row; 'space-between' is the valid CSS keyword ('between' is ignored by browsers) */}
+                <Box sx={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', mb: 2 }}>
+                  <Typography variant="h6" sx={{ fontWeight: 700, color: '#1e293b', fontSize: '1.1rem' }}>
+                    🎬 Scene Video Generation
+                  </Typography>
+                </Box>
+                <Typography variant="body2" color="text.secondary" sx={{ mb: 2.5 }}>
+                  Generate video for each scene individually. Videos are created using your scene images and audio narration. 
+                  You can preview assets and retry failed scenes.
                </Typography>
                <Stack spacing={2}>
-                  {scenes.map((scene) => (
-                    <SceneCard
-                      key={scene.scene_number}
-                      scene={scene}
-                      isEditing={editingSceneId === scene.scene_number}
-                      editedScene={editedScene}
-                      onToggle={onToggleScene}
-                      onEdit={onEditScene}
-                      onSave={onSaveScene}
-                      onCancel={onCancelEdit}
-                      onEditChange={onEditChange}
-                      loading={loading}
+                  {scenes.filter(s => s.enabled !== false).map((scene) => {
+                    const st = sceneStatuses[scene.scene_number] || { status: 'idle', progress: 0 };
+                    const hasAssets = !!scene.imageUrl && !!scene.audioUrl;
+                    const running = st.status === 'running';
+                    const failed = st.status === 'failed';
+                    const completed = st.status === 'completed';
+                    
+                    return (
+                      <Paper
+                        key={scene.scene_number}
+                        elevation={0}
+                        sx={{ 
+                          p: 3,
+                          border: completed ? '2px solid #10b981' : failed ? '2px solid #ef4444' : '2px solid #e2e8f0',
+                          borderRadius: 2,
+                          bgcolor: completed ? '#f0fdf4' : failed ? '#fef2f2' : 'white',
+                          transition: 'all 0.2s ease-in-out',
+                          '&:hover': {
+                            boxShadow: '0 4px 6px -1px rgba(0, 0, 0, 0.1)',
+                          },
+                        }}
+                      >
+                        <Stack spacing={2}>
+                          {/* Header Row */}
+                          <Box sx={{ display: 'flex', alignItems: 'flex-start', justifyContent: 'space-between', gap: 2 }}>
+                            <Box sx={{ flex: 1, minWidth: 0 }}>
+                              <Typography variant="subtitle1" sx={{ fontWeight: 700, color: '#1e293b', mb: 0.5 }}>
+                                Scene {scene.scene_number}: {scene.title}
+                              </Typography>
+                              <Stack direction="row" spacing={1.5} alignItems="center" flexWrap="wrap" useFlexGap>
+                                <Chip 
+                                  label={`${scene.duration_estimate}s`} 
+                                  size="small" 
+                                  sx={{ 
+                                    fontSize: '0.75rem',
+                                    fontWeight: 600,
+                                    bgcolor: '#eff6ff',
+                                    color: '#1e40af',
+                                  }} 
+                                />
+                                {/* Asset Status Chips */}
+                                <Tooltip 
+                                  title={scene.imageUrl ? "Image ready - click to preview" : "Image not generated yet"} 
+                                  arrow
+                                >
+                                  <Chip
+                                    icon={<ImageIcon sx={{ fontSize: 14 }} />}
+                                    label="Image"
+                                    size="small"
+                                    onClick={scene.imageUrl ? () => handlePreviewScene(scene) : undefined}
+                                    sx={{
+                                      fontSize: '0.75rem',
+                                      fontWeight: 500,
+                                      bgcolor: scene.imageUrl ? '#d1fae5' : '#fee2e2',
+                                      color: scene.imageUrl ? '#065f46' : '#991b1b',
+                                      cursor: scene.imageUrl ? 'pointer' : 'default',
+                                      '&:hover': scene.imageUrl ? {
+                                        bgcolor: '#a7f3d0',
+                                      } : {},
+                                    }}
+                                  />
+                                </Tooltip>
+                                <Tooltip 
+                                  title={scene.audioUrl ? "Audio ready - click to preview" : "Audio not generated yet"} 
+                                  arrow
+                                >
+                                  <Chip
+                                    icon={<VolumeUp sx={{ fontSize: 14 }} />}
+                                    label="Audio"
+                                    size="small"
+                                    onClick={scene.audioUrl ? () => handlePreviewScene(scene) : undefined}
+                                    sx={{
+                                      fontSize: '0.75rem',
+                                      fontWeight: 500,
+                                      bgcolor: scene.audioUrl ? '#d1fae5' : '#fee2e2',
+                                      color: scene.audioUrl ? '#065f46' : '#991b1b',
+                                      cursor: scene.audioUrl ? 'pointer' : 'default',
+                                      '&:hover': scene.audioUrl ? {
+                                        bgcolor: '#a7f3d0',
+                                      } : {},
+                                    }}
+                                  />
+                                </Tooltip>
+                                {/* Status Indicator */}
+                                {completed && (
+                                  <Chip 
+                                    icon={<CheckCircle sx={{ fontSize: 14 }} />}
+                                    label="Video Ready" 
+                                    size="small" 
+                                    color="success"
+                                    sx={{ fontWeight: 600, fontSize: '0.75rem' }}
+                                  />
+                                )}
+                                {failed && (
+                                  <Chip 
+                                    label="Failed" 
+                                    size="small" 
+                                    color="error"
+                                    sx={{ fontWeight: 600, fontSize: '0.75rem' }}
+                                  />
+                                )}
+                              </Stack>
+                            </Box>
+
+                            {/* Action Buttons */}
+                            <Stack direction="row" spacing={1} alignItems="center">
+                              {running && st.progress > 0 && st.progress < 100 && (
+                                <Box sx={{ display: 'flex', alignItems: 'center', gap: 1 }}>
+                                  <CircularProgress 
+                                    size={32} 
+                                    variant="determinate" 
+                                    value={Math.min(100, st.progress)} 
+                                    sx={{ color: '#667eea' }}
+                                  />
+                                  <Typography variant="caption" sx={{ color: '#64748b', fontWeight: 600 }}>
+                                    {Math.round(st.progress)}%
+                                  </Typography>
+                                </Box>
+                              )}
+                              {hasAssets && (
+                                <Tooltip title="Preview scene assets" arrow>
+                                  <IconButton
+                                    size="small"
+                                    onClick={() => handlePreviewScene(scene)}
+                                    sx={{
+                                      color: '#667eea',
+                                      '&:hover': {
+                                        bgcolor: '#eff6ff',
+                                      },
+                                    }}
+                                  >
+                                    <Visibility />
+                                  </IconButton>
+                                </Tooltip>
+                              )}
+                              <Button
+                                variant={completed ? "outlined" : "contained"}
+                                color={completed ? "success" : "primary"}
+                                onClick={() => runSceneVideo(scene)}
+                                disabled={!hasAssets || running}
+                                startIcon={running ? <CircularProgress size={16} sx={{ color: 'white' }} /> : undefined}
+                                sx={{ 
+                                  textTransform: 'none', 
+                                  fontWeight: 700,
+                                  minWidth: 120,
+                                  px: 2.5,
+                                }}
+                              >
+                                {running ? 'Generating' : failed ? 'Retry Video' : completed ? 'Regenerate' : 'Generate Video'}
+                              </Button>
+                            </Stack>
+                          </Box>
+
+                          {/* Progress/Error Message */}
+                          {st.status !== 'idle' && st.status !== 'completed' && (
+                            <Box 
+                              sx={{ 
+                                px: 2, 
+                                py: 1, 
+                                bgcolor: failed ? '#fef2f2' : '#f8fafc',
+                                borderRadius: 1,
+                                border: `1px solid ${failed ? '#fecaca' : '#e2e8f0'}`,
+                              }}
+                            >
+                              <Typography 
+                                variant="body2" 
+                                sx={{ 
+                                  color: failed ? '#991b1b' : '#475569',
+                                  fontSize: '0.875rem',
+                                  fontWeight: 500,
+                                }}
+                              >
+                                {running 
+                                  ? `Generating video... This may take 1-2 minutes.`
+                                  : failed
+                                    ? `❌ ${st.error || 'Generation failed. Please retry.'}`
+                                    : 'Processing...'}
+                              </Typography>
+                            </Box>
+                          )}
+                        </Stack>
+                      </Paper>
+                    );
+                  })}
+                </Stack>
+              </Box>
+            )}
+
+            {/* Combine Scene Videos (Optional) */}
+            {combineScenes && scenes.filter(s => s.enabled !== false && s.videoUrl).length >= 2 && (
+              <Box sx={{ mb: 3, p: 2.5, bgcolor: '#f0fdf4', borderRadius: 2, border: '2px solid #10b981' }}>
+                <Typography variant="h6" sx={{ mb: 1, fontWeight: 600, color: '#065f46' }}>
+                  🎞️ Combine Scene Videos
+                </Typography>
+                <Typography variant="body2" color="text.secondary" sx={{ mb: 2 }}>
+                  All scene videos are ready! Combine them into one final video.
+                </Typography>
+                <Stack direction="row" spacing={2} alignItems="center">
+                  <Button
+                    variant="contained"
+                    color="success"
+                    onClick={combineVideos}
+                    disabled={combining}
+                    startIcon={combining ? <CircularProgress size={20} sx={{ color: 'white' }} /> : undefined}
+                    sx={{ textTransform: 'none', fontWeight: 700 }}
+                  >
+                    {combining ? 'Combining Videos...' : 'Combine Into Final Video'}
+                  </Button>
+                  {combining && (
+                    <Typography variant="body2" color="text.secondary">
+                      {combiningMessage} ({combiningProgress.toFixed(0)}%)
+                    </Typography>
+                  )}
+                  {finalVideoUrl && (
+                    <Chip 
+                      label="✅ Final video ready" 
+                      color="success"
+                      sx={{ fontWeight: 600 }}
                    />
-                  ))}
+                  )}
                </Stack>
              </Box>
            )}
@@ -140,42 +389,60 @@ export const RenderStep: React.FC<RenderStepProps> = React.memo(({
              onCombineScenesChange={onCombineScenesChange}
            />

-            {/* Render Summary and Cost Estimate */}
-            <Box sx={{ p: 2, bgcolor: '#f4f4f4', borderRadius: 1, border: `1px solid ${YT_BORDER}` }}>
-              <Typography variant="subtitle2" sx={{ mb: 1, fontWeight: 600 }}>
-                Render Summary
-              </Typography>
-              <Typography variant="body2" color="text.secondary" sx={{ mb: 2 }}>
-                • {enabledScenesCount} scenes will be rendered
-                <br />
-                • Resolution: {resolution}
-                <br />
-                • {combineScenes ? 'Scenes will be combined into one video' : 'Each scene will be a separate video'}
-                <br />
-              </Typography>
-
-              <CostEstimateCard
-                costEstimate={costEstimate}
-                loadingCostEstimate={loadingCostEstimate}
-              />
-            </Box>
+            {/* Cost Estimate */}
+            <CostEstimateCard
+              costEstimate={costEstimate}
+              loadingCostEstimate={loadingCostEstimate}
+              scenes={scenes}
+            />

            {/* Action Buttons */}
            <Box sx={{ display: 'flex', gap: 2 }}>
-              <Button variant="outlined" onClick={onBack}>
-                Back to Scenes
-              </Button>
-              <Button
-                variant="contained"
-                color="error"
-                size="large"
-                onClick={onStartRender}
-                disabled={loading || enabledScenesCount === 0}
-                startIcon={loading ? <CircularProgress size={20} /> : <PlayArrow />}
-                sx={{ px: 4 }}
+              <Button 
+                variant="outlined" 
+                startIcon={<ArrowBack />}
+                onClick={onBack}
              >
-                {loading ? 'Starting Render...' : 'Start Video Render'}
+                Back to Assets
              </Button>
+              <Tooltip
+                title={
+                  enabledScenesCount === 0
+                    ? "Please enable at least one scene"
+                    : loading
+                    ? "Video render in progress"
+                    : `Generate videos for ${enabledScenesCount} scene${enabledScenesCount !== 1 ? 's' : ''}. Estimated cost includes video generation and processing.`
+                }
+                arrow
+                placement="top"
+              >
+                <span>
+                  <Button
+                    variant="contained"
+                    color="error"
+                    size="large"
+                    startIcon={<PlayArrow />}
+                    onClick={onStartRender}
+                    disabled={loading || enabledScenesCount === 0}
+                    sx={{
+                      px: 4,
+                      fontWeight: 600,
+                      '&:disabled': {
+                        opacity: 0.6,
+                      },
+                    }}
+                  >
+                    {loading ? (
+                      <>
+                        Rendering...
+                        <CircularProgress size={16} sx={{ ml: 1 }} color="inherit" />
+                      </>
+                    ) : (
+                      `Start Video Render ${costEstimate?.total_cost ? `($${costEstimate.total_cost.toFixed(2)})` : ''}`
+                    )}
+                  </Button>
+                </span>
+              </Tooltip>
            </Box>
          </Stack>
        ) : (
@@ -188,6 +455,37 @@ export const RenderStep: React.FC<RenderStepProps> = React.memo(({
          />
        )}
      </Paper>
+      <Snackbar
+        open={snackbar.open}
+        autoHideDuration={4000}
+        onClose={() => setSnackbar((s) => ({ ...s, open: false }))}
+        anchorOrigin={{ vertical: 'bottom', horizontal: 'center' }}
+      >
+        <MuiAlert
+          onClose={() => setSnackbar((s) => ({ ...s, open: false }))}
+          severity={snackbar.severity}
+          elevation={6}
+          variant="filled"
+          sx={{ width: '100%' }}
+        >
+          {snackbar.message}
+        </MuiAlert>
+      </Snackbar>
+
+      {/* Scene Preview Modal */}
+      {previewScene && (
+        <ScenePreviewModal
+          open={previewModalOpen}
+          onClose={() => {
+            setPreviewModalOpen(false);
+            setPreviewScene(null);
+          }}
+          sceneTitle={previewScene.title}
+          sceneNumber={previewScene.scene_number}
+          imageUrl={previewScene.imageUrl}
+          audioUrl={previewScene.audioUrl}
+        />
+      )}
    </motion.div>
  );
 });
--- a/frontend/src/components/YouTubeCreator/components/SceneCard.tsx
+++ b/frontend/src/components/YouTubeCreator/components/SceneCard.tsx
@@ -1,26 +1,51 @@
 /**
 * Scene Card Component
+ *
+ * Displays a YouTube scene with editing, generation, and media display capabilities.
+ * Refactored for reusability and maintainability following React best practices.
 */

-import React from 'react';
+import React, { useCallback, useEffect } from 'react';
 import {
  Card,
  CardContent,
-  Typography,
  Stack,
-  Chip,
  Box,
-  FormControlLabel,
-  Switch,
-  IconButton,
-  TextField,
-  Button,
-  Tooltip,
-  Alert,
 } from '@mui/material';
-import { Edit, Check, Close, Movie, Shuffle, CallMade, ArrowForward, HelpOutline, Info, RecordVoiceOver, Videocam, AutoAwesome } from '@mui/icons-material';
 import { Scene } from '../../../services/youtubeApi';
-import { inputSx, labelSx } from '../styles';
+import { AudioGenerationSettings } from '../../../components/shared/AudioSettingsModal';
+import { YouTubeImageGenerationSettings } from '../shared/YouTubeImageGenerationModal';
+
+// Custom hooks
+import { useSceneMedia } from '../hooks/useSceneMedia';
+import { useGenerationState } from '../hooks/useGenerationState';
+
+// Sub-components
+import { SceneHeader } from './SceneCard/SceneHeader';
+import { SceneContent } from './SceneCard/SceneContent';
+import { SceneEditForm } from './SceneCard/SceneEditForm';
+import { GenerationButtons } from './SceneCard/GenerationButtons';
+import { GenerationModals } from './SceneCard/GenerationModals';
+import { InfoAlert } from './SceneCard/InfoAlert';
+
+// Types
+/**
+ * Props accepted by {@link SceneCard}.
+ *
+ * NOTE(review): an interface with the same name is declared again directly
+ * below this one. TypeScript merges the two declarations, so every member
+ * they share must keep an identical type - consider removing one of them.
+ */
+interface SceneCardProps {
+  /** Scene rendered by the card. */
+  scene: Scene;
+  /** When true the card shows the edit form instead of the read-only content. */
+  isEditing: boolean;
+  /** Pending edits; merged over `scene` while `isEditing` is true. */
+  editedScene: Partial<Scene> | null;
+  /** Called with the scene number; forwarded to the card header. */
+  onToggle: (sceneNumber: number) => void;
+  /** Called with the scene when editing is requested; forwarded to the card header. */
+  onEdit: (scene: Scene) => void;
+  /** Forwarded to the edit form as its save callback. */
+  onSave: () => void;
+  /** Forwarded to the edit form as its cancel callback. */
+  onCancel: () => void;
+  /** Receives partial updates from the edit form. */
+  onEditChange: (updates: Partial<Scene>) => void;
+  /** Starts image generation for the scene; image controls are inert when omitted. */
+  onGenerateImage?: (scene: Scene, imageSettings?: YouTubeImageGenerationSettings) => Promise<void>;
+  /** Parent-owned flag: image generation for this scene is in progress. */
+  generatingImage?: boolean;
+  /** Starts audio generation for the scene; audio controls are inert when omitted. */
+  onGenerateAudio?: (scene: Scene, audioSettings?: AudioGenerationSettings) => Promise<void>;
+  /** Parent-owned flag: audio generation for this scene is in progress. */
+  generatingAudio?: boolean;
+  /** Blocks opening the generation modals and is forwarded to the edit form. */
+  loading: boolean;
+  /** Base avatar URL for character consistency. */
+  avatarUrl?: string | null;
+  /** Video plan idea for context. */
+  videoPlanIdea?: string;
+}

 interface SceneCardProps {
  scene: Scene;
@@ -31,7 +56,13 @@ interface SceneCardProps {
  onSave: () => void;
  onCancel: () => void;
  onEditChange: (updates: Partial<Scene>) => void;
+  onGenerateImage?: (scene: Scene, imageSettings?: YouTubeImageGenerationSettings) => Promise<void>;
+  generatingImage?: boolean;
+  onGenerateAudio?: (scene: Scene, audioSettings?: AudioGenerationSettings) => Promise<void>;
+  generatingAudio?: boolean;
  loading: boolean;
+  avatarUrl?: string | null; // Base avatar URL for character consistency
+  videoPlanIdea?: string; // Video plan idea for context
 }

 // Helper function to get border color based on scene emphasis
@@ -52,33 +83,6 @@ const getSceneBorderColor = (emphasisTags?: string[]): string => {
  }
 };

-// Helper function to get icon for scene emphasis
-const getSceneIcon = (emphasisTag: string) => {
-  switch (emphasisTag) {
-    case 'hook':
-      return <Movie fontSize="small" />;
-    case 'cta':
-      return <CallMade fontSize="small" />;
-    case 'transition':
-      return <Shuffle fontSize="small" />;
-    case 'main_content':
-      return <ArrowForward fontSize="small" />;
-    default:
-      return <ArrowForward fontSize="small" />;
-  }
-};
-
-// Helper function to get color for scene emphasis
-const getSceneChipColor = (emphasisTag: string): 'primary' | 'secondary' | 'default' => {
-  switch (emphasisTag) {
-    case 'hook':
-      return 'primary';
-    case 'cta':
-      return 'secondary';
-    default:
-      return 'default';
-  }
-};

 export const SceneCard: React.FC<SceneCardProps> = React.memo(({
  scene,
@@ -89,416 +93,350 @@ export const SceneCard: React.FC<SceneCardProps> = React.memo(({
  onSave,
  onCancel,
  onEditChange,
+  onGenerateImage,
+  generatingImage = false,
+  onGenerateAudio,
+  generatingAudio = false,
  loading,
+  avatarUrl,
+  videoPlanIdea,
 }) => {
  const sceneData = isEditing && editedScene ? { ...scene, ...editedScene } : scene;
+
+  // Custom hooks
+  const { imageBlobUrl, imageLoading, audioBlobUrl, audioLoading } = useSceneMedia({
+    imageUrl: sceneData.imageUrl,
+    audioUrl: sceneData.audioUrl,
+  });
+
+  // Debug logging. The effect has no dependency array, so it runs after
+  // every committed render of this card and logs the current media state.
+  // NOTE(review): this is noisy with many scenes on screen - consider gating
+  // it behind a development-only flag or removing it before release.
+  React.useEffect(() => {
+    console.log('[SceneCard] Render', {
+      sceneNumber: scene.scene_number,
+      imageUrl: scene.imageUrl,
+      hasImageBlobUrl: !!imageBlobUrl,
+      imageLoading,
+      generatingImage,
+    });
+  });
+
+  // Local UI state for the generation flows, all supplied by
+  // useGenerationState: visibility of the audio/image settings modals, the
+  // audio settings most recently applied, a progress number and a status
+  // string per media type, and helpers that reset each media type's state.
+  const {
+    showAudioSettingsModal,
+    setShowAudioSettingsModal,
+    showImageSettingsModal,
+    setShowImageSettingsModal,
+    currentAudioSettings,
+    setCurrentAudioSettings,
+    imageGenerationProgress,
+    setImageGenerationProgress,
+    imageGenerationStatus,
+    setImageGenerationStatus,
+    audioGenerationProgress,
+    setAudioGenerationProgress,
+    audioGenerationStatus,
+    setAudioGenerationStatus,
+    resetImageGeneration,
+    resetAudioGeneration,
+  } = useGenerationState();
+
+  // ---------------------------------------------------------------------
+  // Keep the local status/progress indicators in step with the parent's
+  // `generatingImage` / `generatingAudio` flags.
+  //
+  // Design notes:
+  //  * Completion is detected as a true -> false transition of the parent
+  //    flag (tracked with refs), not by matching status text, so statuses
+  //    written by the apply handlers are settled as well.
+  //  * The outcome message stays visible until a delayed reset fires; there
+  //    is no unconditional "reset whenever not generating" effect, because
+  //    that wiped success/failure/error messages in the same commit.
+  //  * Delayed callbacks read the newest URL/status through refs instead of
+  //    the values captured when the effect ran (which can never change).
+  //  * A final status already written by a handler ("Error: ..." or
+  //    "... successfully ...") is left untouched.
+  //  * Pending timers are cancelled when a new generation starts and when
+  //    the card unmounts.
+  // NOTE(review): success is inferred from the scene having a media URL, so
+  // a failed re-generation of a scene that already has media reports success.
+  // ---------------------------------------------------------------------
+
+  // Mirrors of the newest props/state for use inside delayed callbacks.
+  const imageUrlRef = React.useRef(sceneData.imageUrl);
+  const audioUrlRef = React.useRef(sceneData.audioUrl);
+  const imageStatusRef = React.useRef(imageGenerationStatus);
+  const audioStatusRef = React.useRef(audioGenerationStatus);
+
+  // Declared before the completion effects so it runs first in every commit.
+  useEffect(() => {
+    imageUrlRef.current = sceneData.imageUrl;
+    audioUrlRef.current = sceneData.audioUrl;
+    imageStatusRef.current = imageGenerationStatus;
+    audioStatusRef.current = audioGenerationStatus;
+  });
+
+  // Previous values of the parent flags, used to detect true -> false.
+  const wasGeneratingImageRef = React.useRef(generatingImage);
+  const wasGeneratingAudioRef = React.useRef(generatingAudio);
+
+  // Handles of the pending settle/reset timers (one slot per media type).
+  const imageTimerRef = React.useRef<ReturnType<typeof setTimeout> | null>(null);
+  const audioTimerRef = React.useRef<ReturnType<typeof setTimeout> | null>(null);
+
+  // Cancels the timer stored in the given slot, if any.
+  const cancelStatusTimer = useCallback(
+    (timerRef: React.MutableRefObject<ReturnType<typeof setTimeout> | null>) => {
+      if (timerRef.current !== null) {
+        clearTimeout(timerRef.current);
+        timerRef.current = null;
+      }
+    },
+    [],
+  );
+
+  // Never let a pending timer update state after the card has unmounted.
+  useEffect(
+    () => () => {
+      cancelStatusTimer(imageTimerRef);
+      cancelStatusTimer(audioTimerRef);
+    },
+    [cancelStatusTimer],
+  );
+
+  // Image: show a generic status when generation was started elsewhere and
+  // no handler has written a status yet.
+  useEffect(() => {
+    if (generatingImage && imageGenerationStatus === '') {
+      setImageGenerationStatus('Generating image...');
+      setImageGenerationProgress(50);
+    }
+  }, [generatingImage, imageGenerationStatus, setImageGenerationStatus, setImageGenerationProgress]);
+
+  // Image: settle the status once the parent reports that generation ended.
+  useEffect(() => {
+    const wasGenerating = wasGeneratingImageRef.current;
+    wasGeneratingImageRef.current = generatingImage;
+
+    if (generatingImage) {
+      // A new run supersedes any reset still pending from the previous one.
+      cancelStatusTimer(imageTimerRef);
+      return;
+    }
+    if (!wasGenerating) return;
+
+    const settle = () => {
+      const status = imageStatusRef.current;
+      const alreadyFinal = status.startsWith('Error:') || status.includes('successfully');
+      if (!alreadyFinal) {
+        const succeeded = Boolean(imageUrlRef.current);
+        setImageGenerationStatus(succeeded ? 'Image generated successfully!' : 'Failed to generate image');
+        setImageGenerationProgress(succeeded ? 100 : 0);
+      }
+      imageTimerRef.current = setTimeout(() => {
+        imageTimerRef.current = null;
+        resetImageGeneration();
+      }, 3000);
+    };
+
+    cancelStatusTimer(imageTimerRef);
+    if (imageUrlRef.current) {
+      settle();
+    } else {
+      // The URL may arrive in a later render than the flag change; give it a
+      // moment before concluding that generation failed.
+      imageTimerRef.current = setTimeout(settle, 500);
+    }
+  }, [generatingImage, cancelStatusTimer, setImageGenerationStatus, setImageGenerationProgress, resetImageGeneration]);
+
+  // Audio: show a generic status when generation was started elsewhere and
+  // no handler has written a status yet.
+  useEffect(() => {
+    if (generatingAudio && audioGenerationStatus === '') {
+      setAudioGenerationStatus('Generating audio...');
+      setAudioGenerationProgress(50);
+    }
+  }, [generatingAudio, audioGenerationStatus, setAudioGenerationStatus, setAudioGenerationProgress]);
+
+  // Audio: settle the status once the parent reports that generation ended.
+  useEffect(() => {
+    const wasGenerating = wasGeneratingAudioRef.current;
+    wasGeneratingAudioRef.current = generatingAudio;
+
+    if (generatingAudio) {
+      // A new run supersedes any reset still pending from the previous one.
+      cancelStatusTimer(audioTimerRef);
+      return;
+    }
+    if (!wasGenerating) return;
+
+    const settle = () => {
+      const status = audioStatusRef.current;
+      const alreadyFinal = status.startsWith('Error:') || status.includes('successfully');
+      if (!alreadyFinal) {
+        const succeeded = Boolean(audioUrlRef.current);
+        setAudioGenerationStatus(succeeded ? 'Audio generated successfully!' : 'Failed to generate audio');
+        setAudioGenerationProgress(succeeded ? 100 : 0);
+      }
+      audioTimerRef.current = setTimeout(() => {
+        audioTimerRef.current = null;
+        resetAudioGeneration();
+      }, 2000);
+    };
+
+    cancelStatusTimer(audioTimerRef);
+    if (audioUrlRef.current) {
+      settle();
+    } else {
+      // The URL may arrive in a later render than the flag change; give it a
+      // moment before concluding that generation failed.
+      audioTimerRef.current = setTimeout(settle, 500);
+    }
+  }, [generatingAudio, cancelStatusTimer, setAudioGenerationStatus, setAudioGenerationProgress, resetAudioGeneration]);
+
+  /**
+   * Picks the card border colour from the first emphasis tag.
+   * Missing, empty or unrecognised tags fall back to the neutral grey.
+   */
+  const getSceneBorderColor = (emphasisTags?: string[]): string => {
+    const neutral = '#e5e7eb';
+    const [leadingTag] = emphasisTags ?? [];
+
+    if (leadingTag === 'hook') return '#3b82f6';
+    if (leadingTag === 'cta') return '#8b5cf6';
+    if (leadingTag === 'transition') return '#10b981';
+    return neutral;
+  };
  const borderColor = getSceneBorderColor(sceneData.emphasis_tags);

+  // Event handlers
+  // Opens the audio settings modal, unless no audio handler was supplied or
+  // work is already in flight (audio generation or the parent's loading flag).
+  const handleAudioModalOpen = useCallback(() => {
+    const canOpen = Boolean(onGenerateAudio) && !generatingAudio && !loading;
+    if (canOpen) {
+      console.log('[SceneCard] Opening audio settings modal for scene', scene.scene_number);
+      setShowAudioSettingsModal(true);
+    }
+  }, [onGenerateAudio, generatingAudio, loading, scene.scene_number, setShowAudioSettingsModal]);
+
+  // Opens the image settings modal, unless no image handler was supplied or
+  // work is already in flight (image generation or the parent's loading flag).
+  const handleImageModalOpen = useCallback(() => {
+    const canOpen = Boolean(onGenerateImage) && !generatingImage && !loading;
+    if (canOpen) {
+      console.log('[SceneCard] Opening image settings modal for scene', scene.scene_number);
+      setShowImageSettingsModal(true);
+    }
+  }, [onGenerateImage, generatingImage, loading, scene.scene_number, setShowImageSettingsModal]);
+
+  /**
+   * Applies the settings chosen in the image modal and asks the parent to
+   * start image generation for this scene.
+   *
+   * Only the start of the task is awaited here; the local status is moved to
+   * "in progress" afterwards and completion is signalled by the parent through
+   * the `generatingImage` prop. A failure to start is shown as an error status
+   * that is cleared again after three seconds.
+   */
+  const handleImageSettingsApply = useCallback(async (settings: YouTubeImageGenerationSettings) => {
+    console.log('[SceneCard] Applying image settings for scene', scene.scene_number, 'with settings:', settings);
+
+    // Guard clauses: nothing to call, or a generation is already running.
+    if (!onGenerateImage) {
+      console.error('[SceneCard] onGenerateImage handler is not provided');
+      return;
+    }
+    const busy = generatingImage || loading;
+    if (busy) {
+      console.warn('[SceneCard] Image generation already in progress, ignoring click');
+      return;
+    }
+
+    setShowImageSettingsModal(false);
+
+    try {
+      setImageGenerationStatus('Starting image generation...');
+      setImageGenerationProgress(5);
+
+      console.log('[SceneCard] Calling onGenerateImage for scene', scene.scene_number, 'with settings');
+      await onGenerateImage(scene, settings);
+      console.log('[SceneCard] onGenerateImage task started for scene', scene.scene_number);
+
+      // The task has only been started; success is not assumed at this point.
+      setImageGenerationStatus('Image generation in progress...');
+      setImageGenerationProgress(25);
+    } catch (error: any) {
+      // Prefer the API's structured detail, then the thrown message, then a generic text.
+      const detail = error?.response?.data?.detail;
+      const errorMessage =
+        detail?.message ||
+        detail?.error ||
+        detail ||
+        error?.message ||
+        'Failed to start image generation. Please try again.';
+
+      setImageGenerationStatus(`Error: ${errorMessage}`);
+      setImageGenerationProgress(0);
+
+      setTimeout(() => resetImageGeneration(), 3000);
+    }
+  }, [onGenerateImage, generatingImage, loading, scene, setShowImageSettingsModal, setImageGenerationStatus, setImageGenerationProgress, resetImageGeneration]);
+
+  /**
+   * Applies the settings chosen in the audio modal and runs audio generation
+   * for this scene, showing time-based progress while the request is awaited.
+   *
+   * Behaviour:
+   *  - The settings are remembered and the modal is closed first.
+   *  - Nothing is started when no `onGenerateAudio` handler was supplied or
+   *    when a generation / parent load is already in flight (same guards as
+   *    the image handler).
+   *  - The progress ticker is always stopped in `finally`.
+   *  - Both the success and the error status are cleared after a short delay,
+   *    so an error no longer stays on the card indefinitely.
+   *
+   * @param settings Audio generation settings selected in the modal.
+   */
+  const handleAudioSettingsApply = useCallback(async (settings: AudioGenerationSettings) => {
+    console.log('[SceneCard] Applying audio settings for scene', scene.scene_number, 'with settings:', settings);
+
+    setCurrentAudioSettings(settings);
+    setShowAudioSettingsModal(false);
+
+    if (!onGenerateAudio) {
+      console.error('[SceneCard] onGenerateAudio handler is not provided');
+      return;
+    }
+
+    if (generatingAudio || loading) {
+      console.warn('[SceneCard] Audio generation already in progress, ignoring click');
+      return;
+    }
+
+    const startTime = Date.now();
+    setAudioGenerationStatus('Submitting audio generation request...');
+    setAudioGenerationProgress(10);
+
+    // Time-based progress: real progress is not reported while the request
+    // is pending, so the status is advanced from the elapsed time instead.
+    const progressInterval: ReturnType<typeof setInterval> = setInterval(() => {
+      const seconds = Math.floor((Date.now() - startTime) / 1000);
+
+      if (seconds < 3) {
+        setAudioGenerationStatus('Submitting request to AI service...');
+        setAudioGenerationProgress(15);
+      } else if (seconds < 10) {
+        setAudioGenerationStatus('AI is generating your audio...');
+        setAudioGenerationProgress(40);
+      } else if (seconds < 20) {
+        setAudioGenerationStatus('Synthesizing narration...');
+        setAudioGenerationProgress(70);
+      } else {
+        setAudioGenerationStatus(`Processing... (${seconds}s elapsed)`);
+        // Creep towards, but never beyond, 90% until the request resolves.
+        setAudioGenerationProgress(Math.min(90, 70 + (seconds - 20) / 2));
+      }
+    }, 1000);
+
+    try {
+      await onGenerateAudio(scene, settings);
+      console.log('[SceneCard] Audio generation completed for scene', scene.scene_number);
+
+      const elapsed = Math.floor((Date.now() - startTime) / 1000);
+      setAudioGenerationStatus(`Audio generated successfully in ${elapsed}s`);
+      setAudioGenerationProgress(100);
+
+      setTimeout(() => resetAudioGeneration(), 2000);
+    } catch (error: unknown) {
+      // Narrow the unknown error step by step; only string values are used,
+      // so an object-valued `detail` can no longer render as "[object Object]".
+      const asRecord = (value: unknown): Record<string, unknown> | undefined =>
+        typeof value === 'object' && value !== null ? (value as Record<string, unknown>) : undefined;
+      const asText = (value: unknown): string | undefined =>
+        typeof value === 'string' && value !== '' ? value : undefined;
+
+      const detail = asRecord(asRecord(asRecord(error)?.response)?.data)?.detail;
+      const errorMessage =
+        asText(asRecord(detail)?.message) ??
+        asText(asRecord(detail)?.error) ??
+        asText(detail) ??
+        asText(asRecord(error)?.message) ??
+        'Failed to generate audio. Please try again.';
+
+      setAudioGenerationStatus(`Error: ${errorMessage}`);
+      setAudioGenerationProgress(0);
+
+      // Same delay the image handler uses for its error status.
+      setTimeout(() => resetAudioGeneration(), 3000);
+    } finally {
+      clearInterval(progressInterval);
+    }
+  }, [scene, generatingAudio, loading, setCurrentAudioSettings, setShowAudioSettingsModal, setAudioGenerationStatus, setAudioGenerationProgress, onGenerateAudio, resetAudioGeneration]);
+
  return (
-    <Card
-      variant="outlined"
-      sx={{
-        opacity: sceneData.enabled === false ? 0.6 : 1,
-        border: sceneData.enabled === false ? '1px dashed #e5e7eb' : `2px solid ${borderColor}`,
-        borderRadius: 2,
-        bgcolor: '#ffffff',
-        transition: 'all 0.2s ease-in-out',
-        '&:hover': {
-          boxShadow: sceneData.enabled !== false ? '0 4px 12px rgba(0, 0, 0, 0.1)' : 'none',
-        },
-      }}
-    >
-      <CardContent sx={{ p: 3 }}>
-        {/* Header Section */}
-        <Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'flex-start', mb: 2.5 }}>
-          <Box sx={{ flexGrow: 1 }}>
-            <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 1.5 }}>
-              <Typography 
-                variant="h6" 
-                sx={{ 
-                  mb: 0,
-                  fontWeight: 700,
-                  fontSize: '1.125rem',
-                  color: '#111827',
-                  letterSpacing: '-0.01em',
-                }}
-              >
-                Scene {scene.scene_number}: {sceneData.title}
-              </Typography>
-              <Tooltip
-                title={
-                  <Box>
-                    <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
-                      Scene Type: {sceneData.emphasis_tags?.[0]?.replace('_', ' ') || 'Main Content'}
-                    </Typography>
-                    <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
-                      {sceneData.emphasis_tags?.[0] === 'hook' 
-                        ? 'Hook scenes capture attention in the first few seconds with compelling visuals or statements.'
-                        : sceneData.emphasis_tags?.[0] === 'cta'
-                        ? 'Call-to-action scenes encourage viewers to like, subscribe, or take a specific action.'
-                        : sceneData.emphasis_tags?.[0] === 'transition'
-                        ? 'Transition scenes smoothly connect different topics or segments.'
-                        : 'Main content scenes deliver the core message and information.'}
-                    </Typography>
-                    <Typography variant="caption" sx={{ display: 'block' }}>
-                      Duration: {sceneData.duration_estimate}s • This affects rendering cost.
-                    </Typography>
-                  </Box>
-                }
-                arrow
-                placement="top"
-              >
-                <IconButton size="small" sx={{ color: '#6b7280', p: 0.5 }}>
-                  <HelpOutline fontSize="small" />
-                </IconButton>
-              </Tooltip>
-            </Box>
-            <Stack direction="row" spacing={1} sx={{ mb: 0 }} flexWrap="wrap" useFlexGap>
-              {sceneData.emphasis_tags?.map((tag) => (
-                <Tooltip
-                  key={tag}
-                  title={
-                    tag === 'hook'
-                      ? 'Hook: Grabs viewer attention immediately'
-                      : tag === 'cta'
-                      ? 'CTA: Encourages viewer action'
-                      : tag === 'transition'
-                      ? 'Transition: Connects segments smoothly'
-                      : 'Main Content: Core message delivery'
-                  }
-                  arrow
-                >
-                  <Chip
-                    label={tag.replace('_', ' ')}
-                    size="small"
-                    color={getSceneChipColor(tag)}
-                    icon={getSceneIcon(tag)}
-                    sx={{
-                      textTransform: 'capitalize',
-                      fontWeight: 600,
-                      fontSize: '0.75rem',
-                    }}
-                  />
-                </Tooltip>
-              ))}
-              <Tooltip
-                title="Estimated duration in seconds. Longer scenes cost more to render but provide more detail."
-                arrow
-              >
-                <Chip
-                  label={`~${sceneData.duration_estimate}s`}
-                  size="small"
-                  variant="outlined"
-                  sx={{ 
-                    ml: 'auto',
-                    fontWeight: 600,
-                    fontSize: '0.75rem',
-                    borderColor: '#d1d5db',
-                    color: '#374151',
-                  }}
-                />
-              </Tooltip>
-            </Stack>
-          </Box>
-          <Box sx={{ display: 'flex', alignItems: 'center', gap: 1 }}>
-            <Tooltip
-              title={
-                sceneData.enabled !== false
-                  ? 'Disable this scene to exclude it from rendering and reduce cost'
-                  : 'Enable this scene to include it in the final video'
-              }
-              arrow
-            >
-              <FormControlLabel
-                control={
-                  <Switch
-                    checked={sceneData.enabled !== false}
-                    onChange={() => onToggle(scene.scene_number)}
-                    size="small"
-                  />
-                }
-                label="Enable"
-                sx={{ mr: 0 }}
+    <>
+      <Card
+        variant="outlined"
+        sx={{
+          opacity: sceneData.enabled === false ? 0.6 : 1,
+          border: sceneData.enabled === false ? '1px dashed #e5e7eb' : `2px solid ${borderColor}`,
+          borderRadius: 2,
+          bgcolor: '#ffffff',
+          transition: 'all 0.2s ease-in-out',
+          '&:hover': {
+            boxShadow: sceneData.enabled !== false ? '0 4px 12px rgba(0, 0, 0, 0.1)' : 'none',
+          },
+        }}
+      >
+        <CardContent sx={{ p: 3 }}>
+          <SceneHeader
+            scene={scene}
+            isEditing={isEditing}
+            onToggle={onToggle}
+            onEdit={onEdit}
+          />
+
+          {isEditing ? (
+            <SceneEditForm
+              scene={scene}
+              editedScene={editedScene || {}}
+              onEditChange={onEditChange}
+              onSave={onSave}
+              onCancel={onCancel}
+              loading={loading}
+            />
+          ) : (
+            <>
+              <SceneContent
+                scene={scene}
+                imageBlobUrl={imageBlobUrl}
+                imageLoading={imageLoading}
+                audioBlobUrl={audioBlobUrl}
+                audioLoading={audioLoading}
              />
-            </Tooltip>
-            {!isEditing && (
-              <Tooltip title="Edit scene narration, visual prompt, or duration" arrow>
-                <IconButton
-                  size="small"
-                  onClick={() => onEdit(scene)}
-                  color="primary"
-                  sx={{
-                    border: '1px solid #e5e7eb',
-                    '&:hover': {
-                      bgcolor: '#f9fafb',
-                    },
-                  }}
-                >
-                  <Edit fontSize="small" />
-                </IconButton>
-              </Tooltip>
-            )}
-          </Box>
-        </Box>

-        {isEditing ? (
-          <Stack spacing={2}>
-            <TextField
-              label="Narration"
-              value={sceneData.narration}
-              onChange={(e) => onEditChange({ narration: e.target.value })}
-              multiline
-              rows={3}
-              fullWidth
-              sx={inputSx}
-              InputLabelProps={{ sx: labelSx }}
-            />
-            <TextField
-              label="Visual Prompt"
-              value={sceneData.visual_prompt}
-              onChange={(e) => onEditChange({ visual_prompt: e.target.value })}
-              multiline
-              rows={2}
-              fullWidth
-              sx={inputSx}
-              InputLabelProps={{ sx: labelSx }}
-            />
-            <TextField
-              label="Duration (seconds)"
-              type="number"
-              value={sceneData.duration_estimate}
-              onChange={(e) => onEditChange({ duration_estimate: parseFloat(e.target.value) || 5 })}
-              inputProps={{ min: 1, max: 10, step: 0.5 }}
-              fullWidth
-              sx={inputSx}
-              InputLabelProps={{ sx: labelSx }}
-            />
-            <Box sx={{ display: 'flex', gap: 1 }}>
-              <Button
-                size="small"
-                variant="contained"
-                startIcon={<Check />}
-                onClick={onSave}
-                disabled={loading}
-              >
-                Save
-              </Button>
-              <Button
-                size="small"
-                variant="outlined"
-                startIcon={<Close />}
-                onClick={onCancel}
-              >
-                Cancel
-              </Button>
-            </Box>
-          </Stack>
-        ) : (
-          <Stack spacing={2.5}>
-            {/* Narration Section */}
-            <Box
-              sx={{
-                p: 2,
-                bgcolor: '#f9fafb',
-                borderRadius: 1.5,
-                border: '1px solid #e5e7eb',
-              }}
-            >
-              <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 1 }}>
-                <RecordVoiceOver sx={{ color: '#6366f1', fontSize: 18 }} />
-                <Typography
-                  variant="subtitle2"
-                  sx={{
-                    fontWeight: 600,
-                    fontSize: '0.875rem',
-                    color: '#111827',
-                    textTransform: 'uppercase',
-                    letterSpacing: '0.05em',
-                  }}
-                >
-                  Narration
-                </Typography>
-                <Tooltip
-                  title="The spoken text or voiceover for this scene. This is what will be narrated in the final video."
-                  arrow
-                >
-                  <IconButton size="small" sx={{ color: '#6b7280', p: 0.25, ml: 0.5 }}>
-                    <HelpOutline fontSize="small" />
-                  </IconButton>
-                </Tooltip>
-              </Box>
-              <Typography
-                variant="body1"
-                sx={{
-                  fontStyle: 'italic',
-                  color: '#374151',
-                  fontSize: '0.9375rem',
-                  lineHeight: 1.7,
-                  fontWeight: 400,
-                  pl: 0.5,
-                }}
-              >
-                "{sceneData.narration}"
-              </Typography>
-            </Box>
+              <GenerationButtons
+                scene={scene}
+                isEditing={isEditing}
+                loading={loading}
+                onGenerateImage={onGenerateImage}
+                generatingImage={generatingImage}
+                onGenerateAudio={onGenerateAudio}
+                generatingAudio={generatingAudio}
+                imageGenerationStatus={imageGenerationStatus}
+                imageGenerationProgress={imageGenerationProgress}
+                audioGenerationStatus={audioGenerationStatus}
+                audioGenerationProgress={audioGenerationProgress}
+                onAudioModalOpen={handleAudioModalOpen}
+                onImageModalOpen={handleImageModalOpen}
+              />

-            {/* Visual Prompt Section */}
-            <Box
-              sx={{
-                p: 2,
-                bgcolor: '#fef3c7',
-                borderRadius: 1.5,
-                border: '1px solid #fde68a',
-              }}
-            >
-              <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 1 }}>
-                <Videocam sx={{ color: '#d97706', fontSize: 18 }} />
-                <Typography
-                  variant="subtitle2"
-                  sx={{
-                    fontWeight: 600,
-                    fontSize: '0.875rem',
-                    color: '#92400e',
-                    textTransform: 'uppercase',
-                    letterSpacing: '0.05em',
-                  }}
-                >
-                  Visual Prompt
-                </Typography>
-                <Tooltip
-                  title={
-                    <Box>
-                      <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
-                        Visual Prompt Explained
-                      </Typography>
-                      <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
-                        This describes the visual content that will be generated for this scene. The AI uses this to create appropriate images or video clips.
-                      </Typography>
-                      <Typography variant="caption" sx={{ display: 'block' }}>
-                        <strong>Tip:</strong> More detailed prompts lead to better visual results. Include camera angles, lighting, and composition details.
-                      </Typography>
-                    </Box>
-                  }
-                  arrow
-                >
-                  <IconButton size="small" sx={{ color: '#d97706', p: 0.25, ml: 0.5 }}>
-                    <HelpOutline fontSize="small" />
-                  </IconButton>
-                </Tooltip>
-              </Box>
-              <Typography
-                variant="body2"
-                sx={{
-                  color: '#78350f',
-                  fontSize: '0.875rem',
-                  lineHeight: 1.7,
-                  pl: 0.5,
-                  fontWeight: 400,
-                }}
-              >
-                {sceneData.visual_prompt}
-              </Typography>
-            </Box>
+              <InfoAlert
+                scene={scene}
+                isEditing={isEditing}
+                onGenerateImage={!!onGenerateImage}
+                onGenerateAudio={!!onGenerateAudio}
+              />
+            </>
+          )}
+        </CardContent>
+      </Card>

-            {/* Visual Cues Section */}
-            {sceneData.visual_cues && sceneData.visual_cues.length > 0 && (
-              <Box
-                sx={{
-                  p: 2,
-                  bgcolor: '#f0f9ff',
-                  borderRadius: 1.5,
-                  border: '1px solid #bae6fd',
-                }}
-              >
-                <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 1.5 }}>
-                  <AutoAwesome sx={{ color: '#0284c7', fontSize: 18 }} />
-                  <Typography
-                    variant="subtitle2"
-                    sx={{
-                      fontWeight: 600,
-                      fontSize: '0.875rem',
-                      color: '#0c4a6e',
-                      textTransform: 'uppercase',
-                      letterSpacing: '0.05em',
-                    }}
-                  >
-                    Visual Cues
-                  </Typography>
-                  <Tooltip
-                    title={
-                      <Box>
-                        <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
-                          Visual Cues Explained
-                        </Typography>
-                        <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
-                          These are specific visual effects, camera techniques, or stylistic elements that will be applied to enhance the scene.
-                        </Typography>
-                        <Typography variant="caption" sx={{ display: 'block' }}>
-                          Examples: Quick Zoom, Sunlight Flare, Energetic Cut, Steady Cam Walk, etc.
-                        </Typography>
-                      </Box>
-                    }
-                    arrow
-                  >
-                    <IconButton size="small" sx={{ color: '#0284c7', p: 0.25, ml: 0.5 }}>
-                      <HelpOutline fontSize="small" />
-                    </IconButton>
-                  </Tooltip>
-                </Box>
-                <Stack direction="row" spacing={0.75} flexWrap="wrap" useFlexGap>
-                  {sceneData.visual_cues.map((cue, idx) => (
-                    <Tooltip
-                      key={`${cue}-${idx}`}
-                      title={`Visual effect: ${cue}`}
-                      arrow
-                    >
-                      <Chip
-                        label={cue}
-                        size="small"
-                        sx={{
-                          fontSize: '0.75rem',
-                          height: 28,
-                          textTransform: 'capitalize',
-                          borderColor: '#7dd3fc',
-                          bgcolor: '#ffffff',
-                          color: '#0c4a6e',
-                          fontWeight: 500,
-                          '&:hover': {
-                            bgcolor: '#e0f2fe',
-                            borderColor: '#0284c7',
-                          },
-                        }}
-                      />
-                    </Tooltip>
-                  ))}
-                </Stack>
-              </Box>
-            )}
-
-            {/* Info Alert for Editing */}
-            <Alert
-              severity="info"
-              icon={<Info fontSize="small" />}
-              sx={{
-                bgcolor: '#eff6ff',
-                border: '1px solid #bfdbfe',
-                '& .MuiAlert-icon': {
-                  color: '#3b82f6',
-                },
-                '& .MuiAlert-message': {
-                  color: '#1e40af',
-                },
-              }}
-            >
-              <Typography variant="caption" sx={{ fontSize: '0.75rem', lineHeight: 1.5 }}>
-                <strong>Tip:</strong> Click the edit icon above to modify narration, visual prompt, or duration. 
-                Disable scenes you don't need to reduce rendering cost.
-              </Typography>
-            </Alert>
-          </Stack>
-        )}
-      </CardContent>
-    </Card>
+      <GenerationModals
+        scene={scene}
+        showAudioSettingsModal={showAudioSettingsModal}
+        setShowAudioSettingsModal={setShowAudioSettingsModal}
+        showImageSettingsModal={showImageSettingsModal}
+        setShowImageSettingsModal={setShowImageSettingsModal}
+        currentAudioSettings={currentAudioSettings}
+        onAudioSettingsApply={handleAudioSettingsApply}
+        onImageSettingsApply={handleImageSettingsApply}
+        generatingAudio={generatingAudio}
+      />
+    </>
  );
 });

--- a/frontend/src/components/YouTubeCreator/components/SceneCard/GenerationButtons.tsx
+++ b/frontend/src/components/YouTubeCreator/components/SceneCard/GenerationButtons.tsx
@@ -0,0 +1,164 @@
+import React from 'react';
+import {
+  Box,
+  Button,
+  Typography,
+  LinearProgress,
+  CircularProgress,
+} from '@mui/material';
+import { Image as ImageIcon, VolumeUp } from '@mui/icons-material';
+import { Scene } from '../../../../services/youtubeApi';
+import { AudioGenerationSettings } from '../../../../components/shared/AudioSettingsModal';
+import { YouTubeImageGenerationSettings } from '../../shared/YouTubeImageGenerationModal';
+
+/** Props for {@link GenerationButtons}. */
+interface GenerationButtonsProps {
+  /** Scene whose `audioUrl` / `imageUrl` select the "Generate" or "Regenerate" presentation. */
+  scene: Scene;
+  /** While true the component renders nothing. */
+  isEditing: boolean;
+  /** While true both buttons are disabled. */
+  loading: boolean;
+  /** Only its presence is checked here: the image button renders when it is defined. */
+  onGenerateImage?: (scene: Scene, imageSettings?: YouTubeImageGenerationSettings) => Promise<void>;
+  generatingImage?: boolean;
+  /** Only its presence is checked here: the audio button renders when it is defined. */
+  onGenerateAudio?: (scene: Scene, audioSettings?: AudioGenerationSettings) => Promise<void>;
+  generatingAudio?: boolean;
+  /** Status text under the image button; text starting with "Error" uses the error colour. */
+  imageGenerationStatus?: string;
+  /** The progress bar is drawn only for values strictly between 0 and 100. */
+  imageGenerationProgress?: number;
+  audioGenerationStatus?: string;
+  audioGenerationProgress?: number;
+  onAudioModalOpen: () => void;
+  onImageModalOpen: () => void;
+}
+
+/** Inputs of the private {@link GenerationControl} helper (one button plus its status area). */
+interface GenerationControlProps {
+  /** Media noun interpolated into the button caption. */
+  label: 'Audio' | 'Image';
+  /** Icon shown while idle; a spinner replaces it while generating. */
+  idleIcon: React.ReactNode;
+  /** True when the scene already has this media. */
+  hasAsset: boolean;
+  generating: boolean;
+  loading: boolean;
+  status: string;
+  progress: number;
+  onOpen: () => void;
+}
+
+/**
+ * Shared layout for the audio and image controls: a full-width button followed by
+ * an optional status line and determinate progress bar.
+ */
+const GenerationControl: React.FC<GenerationControlProps> = ({
+  label,
+  idleIcon,
+  hasAsset,
+  generating,
+  loading,
+  status,
+  progress,
+  onOpen,
+}) => {
+  // Caption priority: in-flight generation, then existing asset, then first-time generation.
+  const caption = generating
+    ? `Generating ${label}...`
+    : hasAsset
+    ? `Regenerate ${label}`
+    : `Generate ${label}`;
+
+  return (
+    <Box sx={{ mt: 2 }}>
+      <Button
+        variant={hasAsset ? 'outlined' : 'contained'}
+        color="primary"
+        startIcon={generating ? <CircularProgress size={16} sx={{ color: 'inherit' }} /> : idleIcon}
+        onClick={onOpen}
+        disabled={generating || loading}
+        fullWidth
+        sx={{ textTransform: 'none', fontWeight: 600, py: 1.5 }}
+      >
+        {caption}
+      </Button>
+      {status && (
+        <Box sx={{ mt: 1.5 }}>
+          <Typography
+            variant="caption"
+            sx={{
+              display: 'block',
+              mb: 0.5,
+              color: status.startsWith('Error') ? 'error.main' : 'text.secondary',
+              fontSize: '0.75rem',
+            }}
+          >
+            {status}
+          </Typography>
+          {/* The bar is hidden at 0 and once progress reaches 100. */}
+          {progress > 0 && progress < 100 && (
+            <LinearProgress
+              variant="determinate"
+              value={progress}
+              sx={{ height: 4, borderRadius: 2, bgcolor: '#e5e7eb' }}
+            />
+          )}
+        </Box>
+      )}
+    </Box>
+  );
+};
+
+/**
+ * Audio and image generation buttons for a scene card.
+ * Returns null while the scene is being edited. Each button renders only when the
+ * matching `onGenerate*` callback is provided; clicking it calls the corresponding
+ * `on*ModalOpen` handler rather than the callback itself.
+ */
+export const GenerationButtons: React.FC<GenerationButtonsProps> = ({
+  scene,
+  isEditing,
+  loading,
+  onGenerateImage,
+  generatingImage = false,
+  onGenerateAudio,
+  generatingAudio = false,
+  imageGenerationStatus = '',
+  imageGenerationProgress = 0,
+  audioGenerationStatus = '',
+  audioGenerationProgress = 0,
+  onAudioModalOpen,
+  onImageModalOpen,
+}) => {
+  if (isEditing) return null;
+  return (
+    <>
+      {onGenerateAudio && (
+        <GenerationControl
+          label="Audio"
+          idleIcon={<VolumeUp />}
+          hasAsset={Boolean(scene.audioUrl)}
+          generating={generatingAudio}
+          loading={loading}
+          status={audioGenerationStatus}
+          progress={audioGenerationProgress}
+          onOpen={onAudioModalOpen}
+        />
+      )}
+      {onGenerateImage && (
+        <GenerationControl
+          label="Image"
+          idleIcon={<ImageIcon />}
+          hasAsset={Boolean(scene.imageUrl)}
+          generating={generatingImage}
+          loading={loading}
+          status={imageGenerationStatus}
+          progress={imageGenerationProgress}
+          onOpen={onImageModalOpen}
+        />
+      )}
+    </>
+  );
+};
--- a/frontend/src/components/YouTubeCreator/components/SceneCard/GenerationModals.tsx
+++ b/frontend/src/components/YouTubeCreator/components/SceneCard/GenerationModals.tsx
@@ -0,0 +1,53 @@
+import React from 'react';
+import { AudioSettingsModal, AudioGenerationSettings } from '../../../../components/shared/AudioSettingsModal';
+import { YouTubeImageGenerationModal, YouTubeImageGenerationSettings } from '../../shared/YouTubeImageGenerationModal';
+import { Scene } from '../../../../services/youtubeApi';
+
+interface GenerationModalsProps {
+  scene: Scene;
+  showAudioSettingsModal: boolean;
+  setShowAudioSettingsModal: (show: boolean) => void;
+  showImageSettingsModal: boolean;
+  setShowImageSettingsModal: (show: boolean) => void;
+  currentAudioSettings: AudioGenerationSettings;
+  onAudioSettingsApply: (settings: AudioGenerationSettings) => void;
+  onImageSettingsApply: (settings: YouTubeImageGenerationSettings) => void;
+  generatingAudio?: boolean;
+  generatingImage?: boolean; // forwarded to the image modal as `isGenerating`; defaults to false
+}
+
+/** Mounts the audio-settings and image-generation modals for one scene; closing either calls its `setShow…Modal(false)`. */
+export const GenerationModals: React.FC<GenerationModalsProps> = ({
+  scene,
+  showAudioSettingsModal,
+  setShowAudioSettingsModal,
+  showImageSettingsModal,
+  setShowImageSettingsModal,
+  currentAudioSettings,
+  onAudioSettingsApply,
+  onImageSettingsApply,
+  generatingAudio = false,
+  generatingImage = false,
+}) => (
+  <>
+    <AudioSettingsModal
+      open={showAudioSettingsModal}
+      onClose={() => setShowAudioSettingsModal(false)}
+      onApplySettings={onAudioSettingsApply}
+      initialSettings={currentAudioSettings}
+      isGenerating={generatingAudio}
+      sceneTitle={scene.title}
+    />
+    <YouTubeImageGenerationModal
+      open={showImageSettingsModal}
+      onClose={() => setShowImageSettingsModal(false)}
+      onGenerate={onImageSettingsApply}
+      // Seeded from both visual prompts; falls back to a title-based prompt when both are empty.
+      initialPrompt={`${scene.visual_prompt || ''}\n${scene.enhanced_visual_prompt || ''}`.trim() || `Create a YouTube scene image for: ${scene.title}`}
+      initialStyle="Realistic" initialRenderingSpeed="Quality"
+      initialAspectRatio="16:9" initialModel="ideogram-v3-turbo"
+      isGenerating={generatingImage}
+      sceneTitle={scene.title}
+    />
+  </>
+);
--- a/frontend/src/components/YouTubeCreator/components/SceneCard/InfoAlert.tsx
+++ b/frontend/src/components/YouTubeCreator/components/SceneCard/InfoAlert.tsx
@@ -0,0 +1,48 @@
+import React from 'react';
+import {
+  Alert,
+  Typography,
+} from '@mui/material';
+import { Info } from '@mui/icons-material';
+import { Scene } from '../../../../services/youtubeApi';
+
+interface InfoAlertProps {
+  scene: Scene;
+  isEditing: boolean;
+  onGenerateImage?: boolean; // true enables the "generate an image" reminder
+  onGenerateAudio?: boolean; // true enables the "generate audio" reminder
+}
+
+/**
+ * Tip banner shown under a scene; renders nothing while the scene is being edited.
+ * Adds a reminder for each enabled media type whose URL is still missing on the scene.
+ */
+export const InfoAlert: React.FC<InfoAlertProps> = ({
+  scene,
+  isEditing,
+  onGenerateImage = false,
+  onGenerateAudio = false,
+}) => {
+  if (isEditing) return null;
+  return (
+    <Alert
+      severity="info"
+      icon={<Info fontSize="small" />}
+      sx={{
+        mt: 2,
+        bgcolor: '#eff6ff',
+        border: '1px solid #bfdbfe',
+        '& .MuiAlert-icon': { color: '#3b82f6' },
+        '& .MuiAlert-message': { color: '#1e40af' },
+      }}
+    >
+      <Typography variant="caption" sx={{ fontSize: '0.75rem', lineHeight: 1.5 }}>
+        <strong>Tip:</strong> Click the edit icon above to modify narration, visual prompt, or duration.
+        {onGenerateImage && !scene.imageUrl && ' Generate an image for this scene before rendering the video.'}
+        {onGenerateAudio && !scene.audioUrl && ' Generate audio narration for this scene before rendering the video.'}
+        {/* JSX drops the line break that follows an expression, so the separating space must be explicit. */}
+        {" Disable scenes you don't need to reduce rendering cost."}
+      </Typography>
+    </Alert>
+  );
+};
--- a/frontend/src/components/YouTubeCreator/components/SceneCard/SceneContent.tsx
+++ b/frontend/src/components/YouTubeCreator/components/SceneCard/SceneContent.tsx
@@ -0,0 +1,333 @@
+import React from 'react';
+import {
+  Typography,
+  Stack,
+  Box,
+  Chip,
+  Tooltip,
+  IconButton,
+  CircularProgress,
+} from '@mui/material';
+import { RecordVoiceOver, Videocam, AutoAwesome, Image as ImageIcon, VolumeUp, HelpOutline } from '@mui/icons-material';
+import { Scene } from '../../../../services/youtubeApi';
+
+interface SceneContentProps { // Props for the SceneContent component
+  scene: Scene;
+  imageBlobUrl?: string | null; // used as the <img> src when set
+  imageLoading?: boolean; // when true a "Loading image..." spinner replaces the image
+  audioBlobUrl?: string | null; // used as the <audio> src when set
+  audioLoading?: boolean; // when true a "Loading audio..." spinner replaces the player
+}
+
+/**
+ * Card showing the scene narration in italics, wrapped in double quotes,
+ * under an uppercase "Narration" heading with a help tooltip.
+ */
+const NarrationSection: React.FC<{ narration: string }> = ({ narration }) => {
+  return (
+    <Box sx={{ p: 2, bgcolor: '#f9fafb', borderRadius: 1.5, border: '1px solid #e5e7eb' }}>
+      {/* Heading row: icon, label, help tooltip */}
+      <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 1 }}>
+        <RecordVoiceOver sx={{ color: '#6366f1', fontSize: 18 }} />
+        <Typography
+          variant="subtitle2"
+          sx={{
+            fontWeight: 600,
+            fontSize: '0.875rem',
+            color: '#111827',
+            textTransform: 'uppercase',
+            letterSpacing: '0.05em',
+          }}
+        >
+          Narration
+        </Typography>
+        <Tooltip
+          title="The spoken text or voiceover for this scene. This is what will be narrated in the final video."
+          arrow
+        >
+          <IconButton size="small" sx={{ color: '#6b7280', p: 0.25, ml: 0.5 }}>
+            <HelpOutline fontSize="small" />
+          </IconButton>
+        </Tooltip>
+      </Box>
+      <Typography
+        variant="body1"
+        sx={{
+          fontStyle: 'italic',
+          color: '#374151',
+          fontSize: '0.9375rem',
+          lineHeight: 1.7,
+          fontWeight: 400,
+          pl: 0.5,
+        }}
+      >
+        "{narration}"
+      </Typography>
+    </Box>
+  );
+};
+
+/**
+ * Card showing the scene's visual prompt under an uppercase "Visual Prompt"
+ * heading whose help tooltip explains what the prompt is used for.
+ */
+const VisualPromptSection: React.FC<{ visualPrompt: string }> = ({ visualPrompt }) => {
+  return (
+    <Box sx={{ p: 2, bgcolor: '#fef3c7', borderRadius: 1.5, border: '1px solid #fde68a' }}>
+      {/* Heading row: icon, label, help tooltip */}
+      <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 1 }}>
+        <Videocam sx={{ color: '#d97706', fontSize: 18 }} />
+        <Typography
+          variant="subtitle2"
+          sx={{
+            fontWeight: 600,
+            fontSize: '0.875rem',
+            color: '#92400e',
+            textTransform: 'uppercase',
+            letterSpacing: '0.05em',
+          }}
+        >
+          Visual Prompt
+        </Typography>
+        <Tooltip
+          title={
+            <Box>
+              <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
+                Visual Prompt Explained
+              </Typography>
+              <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                This describes the visual content that will be generated for this scene. The AI uses this to create appropriate images or video clips.
+              </Typography>
+              <Typography variant="caption" sx={{ display: 'block' }}>
+                <strong>Tip:</strong> More detailed prompts lead to better visual results. Include camera angles, lighting, and composition details.
+              </Typography>
+            </Box>
+          }
+          arrow
+        >
+          <IconButton size="small" sx={{ color: '#d97706', p: 0.25, ml: 0.5 }}>
+            <HelpOutline fontSize="small" />
+          </IconButton>
+        </Tooltip>
+      </Box>
+      <Typography
+        variant="body2"
+        sx={{
+          color: '#78350f',
+          fontSize: '0.875rem',
+          lineHeight: 1.7,
+          pl: 0.5,
+          fontWeight: 400,
+        }}
+      >
+        {visualPrompt}
+      </Typography>
+    </Box>
+  );
+};
+
+/**
+ * Card listing the scene's visual cues as chips, one tooltip per chip,
+ * under an uppercase "Visual Cues" heading with a help tooltip.
+ */
+const VisualCuesSection: React.FC<{ visualCues: string[] }> = ({ visualCues }) => {
+  return (
+    <Box sx={{ p: 2, bgcolor: '#f0f9ff', borderRadius: 1.5, border: '1px solid #bae6fd' }}>
+      {/* Heading row: icon, label, help tooltip */}
+      <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 1.5 }}>
+        <AutoAwesome sx={{ color: '#0284c7', fontSize: 18 }} />
+        <Typography
+          variant="subtitle2"
+          sx={{
+            fontWeight: 600,
+            fontSize: '0.875rem',
+            color: '#0c4a6e',
+            textTransform: 'uppercase',
+            letterSpacing: '0.05em',
+          }}
+        >
+          Visual Cues
+        </Typography>
+        <Tooltip
+          title={
+            <Box>
+              <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
+                Visual Cues Explained
+              </Typography>
+              <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                These are specific visual effects, camera techniques, or stylistic elements that will be applied to enhance the scene.
+              </Typography>
+              <Typography variant="caption" sx={{ display: 'block' }}>
+                Examples: Quick Zoom, Sunlight Flare, Energetic Cut, Steady Cam Walk, etc.
+              </Typography>
+            </Box>
+          }
+          arrow
+        >
+          <IconButton size="small" sx={{ color: '#0284c7', p: 0.25, ml: 0.5 }}>
+            <HelpOutline fontSize="small" />
+          </IconButton>
+        </Tooltip>
+      </Box>
+      <Stack direction="row" spacing={0.75} flexWrap="wrap" useFlexGap>
+        {visualCues.map((cueLabel, position) => (
+          <Tooltip
+            key={`${cueLabel}-${position}`}
+            title={`Visual effect: ${cueLabel}`}
+            arrow
+          >
+            <Chip
+              label={cueLabel}
+              size="small"
+              sx={{
+                fontSize: '0.75rem',
+                height: 28,
+                textTransform: 'capitalize',
+                borderColor: '#7dd3fc',
+                bgcolor: '#ffffff',
+                color: '#0c4a6e',
+                fontWeight: 500,
+                '&:hover': {
+                  bgcolor: '#e0f2fe',
+                  borderColor: '#0284c7',
+                },
+              }}
+            />
+          </Tooltip>
+        ))}
+      </Stack>
+    </Box>
+  );
+};
+
+/**
+ * Green card that frames a generated-media preview: an icon, an uppercase
+ * title with a help tooltip, then whatever children the caller supplies.
+ */
+const GeneratedMediaSection: React.FC<{
+  title: string;
+  icon: React.ReactNode;
+  children: React.ReactNode;
+}> = ({ title, icon, children }) => {
+  // The tooltip sentence embeds the lower-cased title.
+  const helpText = `This is the AI-generated ${title.toLowerCase()} for this scene. It will be used when rendering the video.`;
+  return (
+    <Box sx={{ mt: 2, p: 2, bgcolor: '#f0fdf4', borderRadius: 1.5, border: '1px solid #86efac' }}>
+      <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 1.5 }}>
+        {icon}
+        <Typography
+          variant="subtitle2"
+          sx={{
+            fontWeight: 600,
+            fontSize: '0.875rem',
+            color: '#166534',
+            textTransform: 'uppercase',
+            letterSpacing: '0.05em',
+          }}
+        >
+          {title}
+        </Typography>
+        <Tooltip
+          title={helpText}
+          arrow
+        >
+          <IconButton size="small" sx={{ color: '#16a34a', p: 0.25, ml: 0.5 }}>
+            <HelpOutline fontSize="small" />
+          </IconButton>
+        </Tooltip>
+      </Box>
+      {children}
+    </Box>
+  );
+};
+
+/**
+ * Read-only body of a scene card: narration, visual prompt, optional visual cues,
+ * and previews of the generated image and audio when the scene has them.
+ */
+export const SceneContent: React.FC<SceneContentProps> = ({
+  scene,
+  imageBlobUrl,
+  imageLoading,
+  audioBlobUrl,
+  audioLoading,
+}) => {
+  // Spinner row shared by the image and audio previews while their blobs load.
+  const renderLoadingRow = (message: string) => (
+    <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, py: 2 }}>
+      <CircularProgress size={20} />
+      <Typography variant="body2" color="text.secondary">
+        {message}
+      </Typography>
+    </Box>
+  );
+
+  // The audio card needs a URL plus either a ready blob or an in-flight load;
+  // unlike the image card it has no "not available" fallback text.
+  const showAudioCard = Boolean(scene.audioUrl && (audioBlobUrl || audioLoading));
+
+  return (
+    <Stack spacing={2.5}>
+      <NarrationSection narration={scene.narration} />
+      <VisualPromptSection visualPrompt={scene.visual_prompt} />
+
+      {scene.visual_cues && scene.visual_cues.length > 0 && (
+        <VisualCuesSection visualCues={scene.visual_cues} />
+      )}
+
+      {/* Generated image: spinner while loading, then the image, otherwise a hint */}
+      {scene.imageUrl && (
+        <GeneratedMediaSection
+          title="Generated Image"
+          icon={<ImageIcon sx={{ color: '#16a34a', fontSize: 18 }} />}
+        >
+          {imageLoading ? (
+            renderLoadingRow('Loading image...')
+          ) : imageBlobUrl ? (
+            <Box
+              component="img"
+              src={imageBlobUrl}
+              alt={scene.title}
+              sx={{
+                width: '100%',
+                maxHeight: 300,
+                borderRadius: 1,
+                objectFit: 'contain',
+                border: '1px solid #86efac',
+              }}
+              onError={(event) => {
+                console.error('[SceneContent] Image failed to load:', {
+                  src: event.currentTarget.src,
+                  imageUrl: scene.imageUrl,
+                });
+              }}
+            />
+          ) : (
+            <Typography variant="body2" color="text.secondary">
+              Image not available yet. If this persists, try regenerating or refresh the page.
+            </Typography>
+          )}
+        </GeneratedMediaSection>
+      )}
+
+      {/* Generated audio: spinner while loading, then the player */}
+      {showAudioCard && (
+        <GeneratedMediaSection
+          title="Generated Audio"
+          icon={<VolumeUp sx={{ color: '#16a34a', fontSize: 18 }} />}
+        >
+          {audioLoading ? (
+            renderLoadingRow('Loading audio...')
+          ) : audioBlobUrl ? (
+            <Box
+              component="audio"
+              controls
+              src={audioBlobUrl}
+              sx={{ width: '100%', borderRadius: 1, border: '1px solid #86efac' }}
+            />
+          ) : null}
+        </GeneratedMediaSection>
+      )}
+    </Stack>
+  );
+};
--- a/frontend/src/components/YouTubeCreator/components/SceneCard/SceneEditForm.tsx
+++ b/frontend/src/components/YouTubeCreator/components/SceneCard/SceneEditForm.tsx
@@ -0,0 +1,82 @@
+import React from 'react';
+import {
+  Stack,
+  TextField,
+  Button,
+  Box,
+} from '@mui/material';
+import { Check, Close } from '@mui/icons-material';
+import { Scene } from '../../../../services/youtubeApi';
+import { inputSx, labelSx } from '../../styles';
+
+/** Props for {@link SceneEditForm}. */
+interface SceneEditFormProps {
+  scene: Scene; // saved scene; supplies a field's value until that field is edited
+  editedScene: Partial<Scene>; // pending edits; a field set here overrides the saved value
+  onEditChange: (updates: Partial<Scene>) => void; // receives a one-field patch per change
+  onSave: () => void;
+  onCancel: () => void;
+  loading: boolean; // disables Save (Cancel stays enabled)
+}
+
+/**
+ * Stateless inline editor for a scene's narration, visual prompt and duration.
+ * Every change is reported via `onEditChange`; a blank, unparsable or zero duration becomes 5.
+ */
+export const SceneEditForm: React.FC<SceneEditFormProps> = ({
+  scene,
+  editedScene,
+  onEditChange,
+  onSave,
+  onCancel,
+  loading,
+}) => {
+  // `??` keeps an edited empty string or 0 instead of reverting to the saved value.
+  const narrationValue = editedScene.narration ?? scene.narration;
+  const visualPromptValue = editedScene.visual_prompt ?? scene.visual_prompt;
+  const durationValue = editedScene.duration_estimate ?? scene.duration_estimate;
+
+  return (
+    <Stack spacing={2}>
+      <TextField
+        label="Narration"
+        value={narrationValue}
+        onChange={(event) => onEditChange({ narration: event.target.value })}
+        multiline
+        rows={3}
+        fullWidth
+        sx={inputSx}
+        InputLabelProps={{ sx: labelSx }}
+      />
+      <TextField
+        label="Visual Prompt"
+        value={visualPromptValue}
+        onChange={(event) => onEditChange({ visual_prompt: event.target.value })}
+        multiline
+        rows={2}
+        fullWidth
+        sx={inputSx}
+        InputLabelProps={{ sx: labelSx }}
+      />
+      <TextField
+        label="Duration (seconds)"
+        type="number"
+        value={durationValue}
+        onChange={(event) => onEditChange({ duration_estimate: parseFloat(event.target.value) || 5 })}
+        inputProps={{ min: 1, max: 10, step: 0.5 }}
+        fullWidth
+        sx={inputSx}
+        InputLabelProps={{ sx: labelSx }}
+      />
+      {/* Save is blocked while loading; Cancel is always available. */}
+      <Box sx={{ display: 'flex', gap: 1 }}>
+        <Button size="small" variant="contained" startIcon={<Check />} onClick={onSave} disabled={loading}>
+          Save
+        </Button>
+        <Button size="small" variant="outlined" startIcon={<Close />} onClick={onCancel}>
+          Cancel
+        </Button>
+      </Box>
+    </Stack>
+  );
+};
--- a/frontend/src/components/YouTubeCreator/components/SceneCard/SceneHeader.tsx
+++ b/frontend/src/components/YouTubeCreator/components/SceneCard/SceneHeader.tsx
@@ -0,0 +1,203 @@
+import React from 'react';
+import {
+  Typography,
+  Stack,
+  Chip,
+  Box,
+  FormControlLabel,
+  Switch,
+  IconButton,
+  Tooltip,
+} from '@mui/material';
+import { Edit, HelpOutline } from '@mui/icons-material';
+import { Scene } from '../../../../services/youtubeApi';
+
+// Helper functions
+/** Picks the card border colour from the first emphasis tag; later tags are ignored. */
+const getSceneBorderColor = (emphasisTags?: string[]): string => {
+  const neutralGray = '#e5e7eb'; // no tags, 'main_content', or any unrecognised tag
+  // A Map (not an object literal) so inherited keys such as 'constructor' never match.
+  const accentByTag = new Map<string, string>([
+    ['hook', '#3b82f6'], // blue
+    ['cta', '#8b5cf6'], // purple
+    ['transition', '#10b981'], // green
+  ]);
+
+  const leadingTag = emphasisTags?.[0];
+  if (leadingTag === undefined) {
+    return neutralGray;
+  }
+  return accentByTag.get(leadingTag) ?? neutralGray;
+};
+
+/**
+ * Emoji rendered inside a scene's tag chip.
+ * Tags other than 'hook', 'cta' and 'transition' (including 'main_content')
+ * fall through to the right-arrow emoji.
+ */
+const getSceneIcon = (emphasisTag: string) => {
+  if (emphasisTag === 'hook') return '🎬'; // clapper board
+  if (emphasisTag === 'cta') return '📣'; // megaphone
+  if (emphasisTag === 'transition') return '🔄'; // circular arrows
+
+  // 'main_content' and anything unrecognised
+  return '➡️';
+};
+
+/** MUI chip colour for a tag: 'hook' is primary, 'cta' is secondary, everything else is default. */
+const getSceneChipColor = (emphasisTag: string): 'primary' | 'secondary' | 'default' => {
+  if (emphasisTag === 'hook') {
+    return 'primary';
+  }
+  if (emphasisTag === 'cta') {
+    return 'secondary';
+  }
+  return 'default';
+};
+
+/** Props for {@link SceneHeader}. */
+interface SceneHeaderProps {
+  scene: Scene;
+  isEditing: boolean; // hides the edit button while true
+  onToggle: (sceneNumber: number) => void; // called with scene.scene_number when the switch changes
+  onEdit: (scene: Scene) => void; // called with the scene when the edit button is clicked
+}
+
+/**
+ * Header row of a scene card: title, scene-type tooltip, emphasis-tag chips,
+ * duration chip, enable switch and (when not editing) the edit button.
+ * The scene type shown in the tooltip is taken from the first emphasis tag only.
+ */
+export const SceneHeader: React.FC<SceneHeaderProps> = ({ scene, isEditing, onToggle, onEdit }) => {
+  const primaryTag = scene.emphasis_tags?.[0];
+  // Replace every underscore (a plain-string pattern would only replace the first).
+  const sceneTypeLabel = primaryTag?.replace(/_/g, ' ') || 'Main Content';
+  const sceneTypeDescription =
+    primaryTag === 'hook'
+      ? 'Hook scenes capture attention in the first few seconds with compelling visuals or statements.'
+      : primaryTag === 'cta'
+      ? 'Call-to-action scenes encourage viewers to like, subscribe, or take a specific action.'
+      : primaryTag === 'transition'
+      ? 'Transition scenes smoothly connect different topics or segments.'
+      : 'Main content scenes deliver the core message and information.';
+  // A scene counts as enabled unless `enabled` is explicitly false.
+  const isEnabled = scene.enabled !== false;
+
+  return (
+    <Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'flex-start', mb: 2.5 }}>
+      <Box sx={{ flexGrow: 1 }}>
+        <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 1.5 }}>
+          <Typography
+            variant="h6"
+            sx={{ mb: 0, fontWeight: 700, fontSize: '1.125rem', color: '#111827', letterSpacing: '-0.01em' }}
+          >
+            Scene {scene.scene_number}: {scene.title}
+          </Typography>
+          <Tooltip
+            title={
+              <Box>
+                <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
+                  Scene Type: {sceneTypeLabel}
+                </Typography>
+                <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                  {sceneTypeDescription}
+                </Typography>
+                <Typography variant="caption" sx={{ display: 'block' }}>
+                  Duration: {scene.duration_estimate}s • This affects rendering cost.
+                </Typography>
+              </Box>
+            }
+            arrow
+            placement="top"
+          >
+            <IconButton size="small" sx={{ color: '#6b7280', p: 0.5 }}>
+              <HelpOutline fontSize="small" />
+            </IconButton>
+          </Tooltip>
+        </Box>
+        <Stack direction="row" spacing={1} sx={{ mb: 0 }} flexWrap="wrap" useFlexGap>
+          {/* Index-suffixed keys stay unique even if a tag is repeated. */}
+          {scene.emphasis_tags?.map((tag, idx) => (
+            <Tooltip
+              key={`${tag}-${idx}`}
+              title={
+                tag === 'hook'
+                  ? 'Hook: Grabs viewer attention immediately'
+                  : tag === 'cta'
+                  ? 'CTA: Encourages viewer action'
+                  : tag === 'transition'
+                  ? 'Transition: Connects segments smoothly'
+                  : 'Main Content: Core message delivery'
+              }
+              arrow
+            >
+              <Chip
+                label={tag.replace(/_/g, ' ')}
+                size="small"
+                color={getSceneChipColor(tag)}
+                icon={<span>{getSceneIcon(tag)}</span>}
+                sx={{ textTransform: 'capitalize', fontWeight: 600, fontSize: '0.75rem' }}
+              />
+            </Tooltip>
+          ))}
+          <Tooltip
+            title="Estimated duration in seconds. Longer scenes cost more to render but provide more detail."
+            arrow
+          >
+            <Chip
+              label={`~${scene.duration_estimate}s`}
+              size="small"
+              variant="outlined"
+              sx={{
+                ml: 'auto',
+                fontWeight: 600,
+                fontSize: '0.75rem',
+                borderColor: '#d1d5db',
+                color: '#374151',
+              }}
+            />
+          </Tooltip>
+        </Stack>
+      </Box>
+      <Box sx={{ display: 'flex', alignItems: 'center', gap: 1 }}>
+        <Tooltip
+          title={
+            isEnabled
+              ? 'Disable this scene to exclude it from rendering and reduce cost'
+              : 'Enable this scene to include it in the final video'
+          }
+          arrow
+        >
+          <FormControlLabel
+            control={
+              <Switch
+                checked={isEnabled}
+                onChange={() => onToggle(scene.scene_number)}
+                size="small"
+              />
+            }
+            label="Enable"
+            sx={{ mr: 0 }}
+          />
+        </Tooltip>
+        {!isEditing && (
+          <Tooltip title="Edit scene narration, visual prompt, or duration" arrow>
+            <IconButton
+              size="small"
+              onClick={() => onEdit(scene)}
+              color="primary"
+              sx={{
+                border: '1px solid #e5e7eb',
+                '&:hover': {
+                  bgcolor: '#f9fafb',
+                },
+              }}
+            >
+              <Edit fontSize="small" />
+            </IconButton>
+          </Tooltip>
+        )}
+      </Box>
+    </Box>
+  );
+};
--- a/frontend/src/components/YouTubeCreator/components/SceneCard/hooks/index.ts
+++ b/frontend/src/components/YouTubeCreator/components/SceneCard/hooks/index.ts
@@ -0,0 +1,2 @@
+export { useSceneMedia } from './useSceneMedia';
+export { useGenerationState } from './useGenerationState';
--- a/frontend/src/components/YouTubeCreator/components/SceneCard/hooks/useGenerationState.ts
+++ b/frontend/src/components/YouTubeCreator/components/SceneCard/hooks/useGenerationState.ts
@@ -0,0 +1,67 @@
+// Hook for managing generation state
+import { useState } from 'react';
+
+interface AudioSettings { // Shape of `currentAudioSettings` held by useGenerationState below
+  voiceId: string; // hook default: "Casual_Guy"
+  speed: number; // hook default: 1.15
+  volume: number; // hook default: 1.0
+  pitch: number; // hook default: 0.0
+  emotion: string; // hook default: "happy"
+  englishNormalization: boolean; // hook default: true
+  bitrate: number; // hook default: 128000 (presumably bits per second — confirm)
+  channel: string; // hook default: "1" (presumably the channel count as a string — confirm)
+  format: string; // hook default: "mp3"
+  enableSyncMode: boolean; // hook default: true
+}
+
+/** Per-card UI state for asset generation: settings dialogs, audio options, progress. */
+export const useGenerationState = () => {
+  // Open/closed flags for the audio and image settings dialogs
+  const [showAudioSettingsModal, setShowAudioSettingsModal] = useState(false);
+  const [showImageSettingsModal, setShowImageSettingsModal] = useState(false);
+
+  // Audio options in effect; they start from these defaults
+  const initialAudioSettings: AudioSettings = {
+    voiceId: "Casual_Guy",
+    speed: 1.15,
+    volume: 1.0,
+    pitch: 0.0,
+    emotion: "happy",
+    englishNormalization: true,
+    bitrate: 128000,
+    channel: "1",
+    format: "mp3",
+    enableSyncMode: true,
+  };
+  const [currentAudioSettings, setCurrentAudioSettings] = useState(initialAudioSettings);
+
+  // Progress value and status text, kept separately for image and audio
+  const [imageGenerationProgress, setImageGenerationProgress] = useState(0);
+  const [imageGenerationStatus, setImageGenerationStatus] = useState('');
+  const [audioGenerationProgress, setAudioGenerationProgress] = useState(0);
+  const [audioGenerationStatus, setAudioGenerationStatus] = useState('');
+
+  /** Clears the image status text and puts its progress back to 0. */
+  function resetImageGeneration() {
+    setImageGenerationStatus('');
+    setImageGenerationProgress(0);
+  }
+
+  /** Clears the audio status text and puts its progress back to 0. */
+  function resetAudioGeneration() {
+    setAudioGenerationStatus('');
+    setAudioGenerationProgress(0);
+  }
+
+  // Each value is returned next to its setter, followed by the two reset helpers
+  return {
+    showAudioSettingsModal, setShowAudioSettingsModal,
+    showImageSettingsModal, setShowImageSettingsModal,
+    currentAudioSettings, setCurrentAudioSettings,
+    imageGenerationProgress, setImageGenerationProgress,
+    imageGenerationStatus, setImageGenerationStatus,
+    audioGenerationProgress, setAudioGenerationProgress,
+    audioGenerationStatus, setAudioGenerationStatus,
+    resetImageGeneration, resetAudioGeneration,
+  };
+};
--- a/frontend/src/components/YouTubeCreator/components/SceneCard/hooks/useSceneMedia.ts
+++ b/frontend/src/components/YouTubeCreator/components/SceneCard/hooks/useSceneMedia.ts
@@ -0,0 +1,76 @@
+// Hook for managing scene media (images and audio)
+import { useState, useEffect } from 'react';
+import { fetchMediaBlobUrl } from '../../../../../utils/fetchMediaBlobUrl';
+
+interface UseSceneMediaProps {
+  /** Source URL of the scene image; a falsy value yields a null `imageBlobUrl`. */
+  imageUrl?: string | null;
+  /** Source URL of the scene audio; a falsy value yields a null `audioBlobUrl`. */
+  audioUrl?: string | null;
+}
+
+/**
+ * Resolves one media URL through `fetchMediaBlobUrl` and tracks its loading state.
+ *
+ * Each effect run owns the URL it obtained: its cleanup revokes that URL when it is a
+ * `blob:` URL, and a result that settles after cleanup is revoked and ignored. The URL
+ * is kept in an effect-local variable because state read inside a cleanup is the value
+ * captured when the effect was created, not the URL that effect later produced.
+ *
+ * @param sourceUrl URL to load; when falsy the hook reports `null` and not loading.
+ * @returns The resolved URL (or `null`) and whether a fetch is in flight.
+ */
+const useBlobUrl = (sourceUrl?: string | null) => {
+  const [blobUrl, setBlobUrl] = useState<string | null>(null);
+  const [loading, setLoading] = useState(false);
+
+  useEffect(() => {
+    // The previous run's URL was revoked by its cleanup, so stop exposing it.
+    setBlobUrl(null);
+    if (!sourceUrl) {
+      setLoading(false);
+      return;
+    }
+
+    let cancelled = false;
+    let createdUrl: string | null = null;
+    const release = (url: string | null) => {
+      if (url?.startsWith('blob:')) URL.revokeObjectURL(url);
+    };
+
+    setLoading(true);
+    fetchMediaBlobUrl(sourceUrl)
+      .then((url) => {
+        if (cancelled) {
+          release(url); // settled after cleanup: nothing will display it
+          return;
+        }
+        createdUrl = url;
+        setBlobUrl(url);
+      })
+      .catch(console.error)
+      .finally(() => {
+        if (!cancelled) setLoading(false);
+      });
+
+    return () => {
+      cancelled = true;
+      release(createdUrl);
+    };
+  }, [sourceUrl]);
+
+  return { blobUrl, loading };
+};
+
+/**
+ * Loads a scene's image and audio for display.
+ *
+ * @returns `imageBlobUrl` / `audioBlobUrl` (null while loading, on failure, or without a
+ *   source URL) together with the `imageLoading` / `audioLoading` flags.
+ */
+export const useSceneMedia = ({ imageUrl, audioUrl }: UseSceneMediaProps) => {
+  const { blobUrl: imageBlobUrl, loading: imageLoading } = useBlobUrl(imageUrl);
+  const { blobUrl: audioBlobUrl, loading: audioLoading } = useBlobUrl(audioUrl);
+
+  return { imageBlobUrl, imageLoading, audioBlobUrl, audioLoading };
+};
--- a/frontend/src/components/YouTubeCreator/components/SceneGenerationStep.tsx
+++ b/frontend/src/components/YouTubeCreator/components/SceneGenerationStep.tsx
@@ -0,0 +1,215 @@
+/**
+ * Scene Generation Step Component
+ * 
+ * Third step: Generate images and audio for each scene before video rendering.
+ */
+
+import React, { useMemo } from 'react';
+import {
+  Paper,
+  Typography,
+  Stack,
+  Button,
+  Box,
+  Alert,
+} from '@mui/material';
+import { ArrowForward, ArrowBack, CheckCircle, Warning } from '@mui/icons-material';
+import { motion } from 'framer-motion';
+import { Scene, VideoPlan } from '../../../services/youtubeApi';
+import { YT_BG, YT_BORDER, YT_TEXT } from '../constants';
+import { SceneCard } from './SceneCard';
+import { AssetGenerationCostCard } from './AssetGenerationCostCard';
+import type { AudioGenerationSettings } from '../../shared/AudioSettingsModal';
+import type { YouTubeImageGenerationSettings } from '../shared';
+
+interface SceneGenerationStepProps {
+  scenes: Scene[]; // every scene gets a card; readiness counts only those with enabled !== false
+  videoPlan: VideoPlan | null; // NOTE(review): destructured by the component but not read in its body
+  editingSceneId: number | null; // scene_number of the card rendered with isEditing=true
+  editedScene: Partial<Scene> | null; // forwarded unchanged to every SceneCard
+  onEditScene: (scene: Scene) => void; // forwarded to SceneCard as onEdit
+  onSaveScene: () => void; // forwarded to SceneCard as onSave
+  onCancelEdit: () => void; // forwarded to SceneCard as onCancel
+  onEditChange: (updates: Partial<Scene>) => void; // forwarded to SceneCard
+  onToggleScene: (sceneNumber: number) => void; // forwarded to SceneCard as onToggle
+  onGenerateImage?: (scene: Scene, settings?: YouTubeImageGenerationSettings) => Promise<void>; // forwarded to SceneCard
+  generatingImageSceneId?: number | null; // the card with this scene_number gets generatingImage=true
+  onGenerateAudio?: (scene: Scene, settings?: AudioGenerationSettings) => Promise<void>; // forwarded to SceneCard
+  generatingAudioSceneId?: number | null; // the card with this scene_number gets generatingAudio=true
+  loading: boolean; // forwarded to every SceneCard
+  avatarUrl?: string | null; // forwarded to every SceneCard
+  videoPlanIdea?: string; // forwarded to every SceneCard
+  onBack: () => void; // click handler of the "Back to Scenes" button
+  onNext: () => void; // click handler of the forward button, which is disabled until all enabled scenes are ready
+}
+
+export const SceneGenerationStep: React.FC<SceneGenerationStepProps> = React.memo(({
+  scenes,
+  videoPlan,
+  editingSceneId,
+  editedScene,
+  onEditScene,
+  onSaveScene,
+  onCancelEdit,
+  onEditChange,
+  onToggleScene,
+  onGenerateImage,
+  generatingImageSceneId,
+  onGenerateAudio,
+  generatingAudioSceneId,
+  loading,
+  avatarUrl,
+  videoPlanIdea,
+  onBack,
+  onNext,
+}) => {
+  // Readiness summary over enabled scenes (enabled !== false); a scene is "ready" when it has both imageUrl and audioUrl
+  const sceneReadiness = useMemo(() => {
+    const enabledScenes = scenes.filter(s => s.enabled !== false);
+    const readyScenes = enabledScenes.filter(s => s.imageUrl && s.audioUrl);
+    const missingImage = enabledScenes.filter(s => !s.imageUrl);
+    const missingAudio = enabledScenes.filter(s => !s.audioUrl);
+    // allReady is false when no scene is enabled, so an all-disabled plan cannot proceed
+    return {
+      allReady: enabledScenes.length > 0 && readyScenes.length === enabledScenes.length,
+      readyCount: readyScenes.length,
+      totalEnabled: enabledScenes.length,
+      missingImageCount: missingImage.length,
+      missingAudioCount: missingAudio.length,
+      scenesMissingImages: missingImage.map(s => s.scene_number),
+      scenesMissingAudio: missingAudio.map(s => s.scene_number),
+    };
+  }, [scenes]);
+  // Gate for the forward button; the summary above is recomputed only when `scenes` changes
+  const canProceed = sceneReadiness.allReady;
+
+  return (
+    <motion.div
+      initial={{ opacity: 0, y: 20 }}
+      animate={{ opacity: 1, y: 0 }}
+    >
+      <Paper
+        sx={{
+          p: 4,
+          backgroundColor: 'white',
+          border: `1px solid ${YT_BORDER}`,
+        }}
+      >
+        <Typography variant="h5" sx={{ mb: 3, fontWeight: 600, color: YT_TEXT }}>
+          3️⃣ Generate Scene Assets
+        </Typography>
+
+        <Typography variant="body2" sx={{ mb: 3, color: '#64748b' }}>
+          Generate custom images and audio narration for each scene. All scenes must have both an image and audio before you can render the final video.
+        </Typography>
+
+        {/* Cost Estimate: receives the full scenes list, not only the enabled scenes */}
+        <AssetGenerationCostCard scenes={scenes} />
+
+        {/* Readiness Alert: success when allReady, otherwise a warning listing scene numbers. NOTE(review): both "need images" and "need audio" lines render even when their count is 0. */}
+        {sceneReadiness.allReady ? (
+          <Alert 
+            severity="success" 
+            icon={<CheckCircle />}
+            sx={{
+              mb: 3,
+              bgcolor: '#f0fdf4',
+              border: '1px solid #86efac',
+              '& .MuiAlert-icon': {
+                color: '#16a34a',
+              },
+            }}
+          >
+            <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
+              ✅ All scenes are ready!
+            </Typography>
+            <Typography variant="caption" sx={{ fontSize: '0.75rem' }}>
+              {sceneReadiness.readyCount} of {sceneReadiness.totalEnabled} enabled scenes have both images and audio. You can proceed to render your video.
+            </Typography>
+          </Alert>
+        ) : (
+          <Alert 
+            severity="warning" 
+            icon={<Warning />}
+            sx={{
+              mb: 3,
+              bgcolor: '#fffbeb',
+              border: '1px solid #fde68a',
+              '& .MuiAlert-icon': {
+                color: '#d97706',
+              },
+            }}
+          >
+            <Typography variant="body2" sx={{ fontWeight: 600, mb: 1 }}>
+              Some scenes need assets generated
+            </Typography>
+            <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+              • {sceneReadiness.missingImageCount} scene(s) need images: {sceneReadiness.scenesMissingImages.join(', ')}
+            </Typography>
+            <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+              • {sceneReadiness.missingAudioCount} scene(s) need audio: {sceneReadiness.scenesMissingAudio.join(', ')}
+            </Typography>
+            <Typography variant="caption" sx={{ display: 'block', mt: 1, fontStyle: 'italic' }}>
+              Click "Generate Image" and "Generate Audio" buttons on each scene card below.
+            </Typography>
+          </Alert>
+        )}
+
+        {/* Scene Cards: one per scene (disabled scenes included), keyed by scene_number */}
+        {scenes.length > 0 && (
+          <Box sx={{ mb: 3 }}>
+            <Stack spacing={2}>
+              {scenes.map((scene) => (
+                <SceneCard
+                  key={scene.scene_number}
+                  scene={scene}
+                  isEditing={editingSceneId === scene.scene_number}
+                  editedScene={editedScene}
+                  onToggle={onToggleScene}
+                  onEdit={onEditScene}
+                  onSave={onSaveScene}
+                  onCancel={onCancelEdit}
+                  onEditChange={onEditChange}
+                  loading={loading}
+                  onGenerateImage={onGenerateImage}
+                  generatingImage={generatingImageSceneId === scene.scene_number}
+                  onGenerateAudio={onGenerateAudio}
+                  generatingAudio={generatingAudioSceneId === scene.scene_number}
+                  avatarUrl={avatarUrl}
+                  videoPlanIdea={videoPlanIdea}
+                />
+              ))}
+            </Stack>
+          </Box>
+        )}
+
+        {/* Action Buttons: the forward button shows the ready/total count until canProceed is true */}
+        <Box sx={{ display: 'flex', gap: 2, mt: 4 }}>
+          <Button
+            variant="outlined"
+            startIcon={<ArrowBack />}
+            onClick={onBack}
+          >
+            Back to Scenes
+          </Button>
+          <Button
+            variant="contained"
+            color="error"
+            size="large"
+            endIcon={<ArrowForward />}
+            onClick={onNext}
+            disabled={!canProceed}
+            sx={{ px: 4 }}
+          >
+            {canProceed 
+              ? 'Proceed to Video Rendering'
+              : `Generate Assets (${sceneReadiness.readyCount}/${sceneReadiness.totalEnabled} ready)`}
+          </Button>
+        </Box>
+      </Paper>
+    </motion.div>
+  );
+});
+
+SceneGenerationStep.displayName = 'SceneGenerationStep';
+
--- a/frontend/src/components/YouTubeCreator/components/ScenePreviewModal.tsx
+++ b/frontend/src/components/YouTubeCreator/components/ScenePreviewModal.tsx
@@ -0,0 +1,297 @@
+/**
+ * Scene Preview Modal
+ * 
+ * Shows a preview of scene image and audio with playback controls.
+ */
+
+import React, { useState, useEffect } from 'react';
+import {
+  Dialog,
+  DialogTitle,
+  DialogContent,
+  IconButton,
+  Box,
+  Typography,
+  Stack,
+  CircularProgress,
+} from '@mui/material';
+import {
+  Close as CloseIcon,
+  PlayArrow,
+  Pause,
+  VolumeUp,
+} from '@mui/icons-material';
+import { fetchMediaBlobUrl } from '../../../utils/fetchMediaBlobUrl';
+
+interface ScenePreviewModalProps {
+  /** Whether the dialog is shown; media is fetched only while this is true. */
+  open: boolean;
+  /** Called after playback is paused when the dialog is dismissed. */
+  onClose: () => void;
+  /** Shown under the heading and used as the image alt text. */
+  sceneTitle: string;
+  /** Rendered in the heading as "Scene {sceneNumber} Preview". */
+  sceneNumber: number;
+  /** Source URL of the scene image; the image section is omitted when falsy. */
+  imageUrl?: string | null;
+  /** Source URL of the narration audio; the audio section is omitted when falsy. */
+  audioUrl?: string | null;
+}
+
+/**
+ * Starts loading `sourceUrl` through `fetchMediaBlobUrl` and returns the effect cleanup.
+ *
+ * The cleanup marks the request cancelled and revokes the `blob:` URL this request
+ * produced. A result that settles after cleanup is revoked and never stored, so neither
+ * closing the dialog mid-fetch nor changing the source leaves an object URL behind.
+ *
+ * @param sourceUrl URL to load.
+ * @param setUrl Receives the resolved URL once it is available.
+ * @param setLoading Receives `true` immediately and `false` once the fetch settles.
+ * @returns Cleanup function intended to be returned from `useEffect`.
+ */
+const loadPreviewBlob = (
+  sourceUrl: string,
+  setUrl: (url: string | null) => void,
+  setLoading: (loading: boolean) => void,
+): (() => void) => {
+  let cancelled = false;
+  let createdUrl: string | null = null;
+  const release = (url: string | null) => {
+    if (url?.startsWith('blob:')) URL.revokeObjectURL(url);
+  };
+
+  setLoading(true);
+  fetchMediaBlobUrl(sourceUrl)
+    .then((url) => {
+      if (cancelled) {
+        release(url);
+        return;
+      }
+      createdUrl = url;
+      setUrl(url);
+    })
+    .catch(console.error)
+    .finally(() => {
+      if (!cancelled) setLoading(false);
+    });
+
+  return () => {
+    cancelled = true;
+    release(createdUrl);
+  };
+};
+
+/**
+ * Dialog that previews a scene's generated image and narration audio.
+ *
+ * Media is fetched only while `open` is true. Audio is played through a detached
+ * `Audio` element that the play/pause button controls.
+ */
+export const ScenePreviewModal: React.FC<ScenePreviewModalProps> = ({
+  open,
+  onClose,
+  sceneTitle,
+  sceneNumber,
+  imageUrl,
+  audioUrl,
+}) => {
+  const [imageBlobUrl, setImageBlobUrl] = useState<string | null>(null);
+  const [audioBlobUrl, setAudioBlobUrl] = useState<string | null>(null);
+  const [imageLoading, setImageLoading] = useState(false);
+  const [audioLoading, setAudioLoading] = useState(false);
+  const [isPlaying, setIsPlaying] = useState(false);
+  const [audioElement, setAudioElement] = useState<HTMLAudioElement | null>(null);
+
+  // Load image blob. State is cleared first because the previous URL was revoked by cleanup.
+  useEffect(() => {
+    setImageBlobUrl(null);
+    if (!imageUrl || !open) {
+      setImageLoading(false);
+      return;
+    }
+    return loadPreviewBlob(imageUrl, setImageBlobUrl, setImageLoading);
+  }, [imageUrl, open]);
+
+  // Load audio blob (same lifecycle as the image).
+  useEffect(() => {
+    setAudioBlobUrl(null);
+    if (!audioUrl || !open) {
+      setAudioLoading(false);
+      return;
+    }
+    return loadPreviewBlob(audioUrl, setAudioBlobUrl, setAudioLoading);
+  }, [audioUrl, open]);
+
+  // Create audio element. Every new (or missing) source starts out paused.
+  useEffect(() => {
+    setIsPlaying(false);
+    if (!audioBlobUrl) {
+      setAudioElement(null);
+      return;
+    }
+
+    const audio = new Audio(audioBlobUrl);
+    const handleEnded = () => setIsPlaying(false);
+    audio.addEventListener('ended', handleEnded);
+    setAudioElement(audio);
+    return () => {
+      audio.pause();
+      audio.removeEventListener('ended', handleEnded);
+    };
+  }, [audioBlobUrl]);
+
+  const togglePlayPause = () => {
+    if (!audioElement) return;
+
+    if (isPlaying) {
+      audioElement.pause();
+      setIsPlaying(false);
+      return;
+    }
+
+    // play() returns a promise that rejects when playback cannot start, so the
+    // "playing" state is only reported once it resolves.
+    audioElement
+      .play()
+      .then(() => setIsPlaying(true))
+      .catch((err: unknown) => {
+        console.error(err);
+        setIsPlaying(false);
+      });
+  };
+
+  const handleClose = () => {
+    audioElement?.pause();
+    setIsPlaying(false);
+    onClose();
+  };
+
+  return (
+    <Dialog
+      open={open}
+      onClose={handleClose}
+      maxWidth="md"
+      fullWidth
+      PaperProps={{
+        sx: {
+          borderRadius: 3,
+          bgcolor: '#f8fafc',
+        },
+      }}
+    >
+      <DialogTitle>
+        <Stack direction="row" justifyContent="space-between" alignItems="center">
+          <Box>
+            <Typography variant="h6" sx={{ fontWeight: 700, color: '#1e293b' }}>
+              Scene {sceneNumber} Preview
+            </Typography>
+            <Typography variant="body2" sx={{ color: '#64748b', mt: 0.5 }}>
+              {sceneTitle}
+            </Typography>
+          </Box>
+          <IconButton onClick={handleClose} size="small">
+            <CloseIcon />
+          </IconButton>
+        </Stack>
+      </DialogTitle>
+
+      <DialogContent>
+        <Stack spacing={3}>
+          {/* Image Preview */}
+          {imageUrl && (
+            <Box>
+              <Typography variant="subtitle2" sx={{ mb: 1.5, fontWeight: 600, color: '#475569' }}>
+                🖼️ Scene Image
+              </Typography>
+              {imageLoading ? (
+                <Box sx={{ display: 'flex', justifyContent: 'center', p: 4 }}>
+                  <CircularProgress />
+                </Box>
+              ) : imageBlobUrl ? (
+                <Box
+                  component="img"
+                  src={imageBlobUrl}
+                  alt={sceneTitle}
+                  sx={{
+                    width: '100%',
+                    height: 'auto',
+                    borderRadius: 2,
+                    boxShadow: '0 4px 6px -1px rgba(0, 0, 0, 0.1)',
+                  }}
+                />
+              ) : (
+                <Typography variant="body2" color="text.secondary">
+                  Failed to load image
+                </Typography>
+              )}
+            </Box>
+          )}
+
+          {/* Audio Preview */}
+          {audioUrl && (
+            <Box>
+              <Typography variant="subtitle2" sx={{ mb: 1.5, fontWeight: 600, color: '#475569' }}>
+                🎤 Scene Audio
+              </Typography>
+              {audioLoading ? (
+                <Box sx={{ display: 'flex', justifyContent: 'center', p: 2 }}>
+                  <CircularProgress size={24} />
+                </Box>
+              ) : audioBlobUrl ? (
+                <Box
+                  sx={{
+                    p: 3,
+                    bgcolor: 'white',
+                    borderRadius: 2,
+                    border: '2px solid #e2e8f0',
+                    display: 'flex',
+                    alignItems: 'center',
+                    gap: 2,
+                  }}
+                >
+                  <IconButton
+                    onClick={togglePlayPause}
+                    disabled={!audioElement}
+                    sx={{
+                      bgcolor: '#667eea',
+                      color: 'white',
+                      '&:hover': {
+                        bgcolor: '#5568d3',
+                      },
+                      '&:disabled': {
+                        bgcolor: '#cbd5e1',
+                      },
+                    }}
+                  >
+                    {isPlaying ? <Pause /> : <PlayArrow />}
+                  </IconButton>
+                  <Box sx={{ flex: 1 }}>
+                    <Typography variant="body2" sx={{ fontWeight: 600, color: '#1e293b' }}>
+                      {isPlaying ? 'Playing...' : 'Click to play audio'}
+                    </Typography>
+                    <Typography variant="caption" sx={{ color: '#64748b' }}>
+                      Scene narration audio
+                    </Typography>
+                  </Box>
+                  <VolumeUp sx={{ color: '#94a3b8' }} />
+                </Box>
+              ) : (
+                <Typography variant="body2" color="text.secondary">
+                  Failed to load audio
+                </Typography>
+              )}
+            </Box>
+          )}
+
+          {!imageUrl && !audioUrl && (
+            <Typography variant="body2" color="text.secondary" sx={{ textAlign: 'center', p: 3 }}>
+              No assets available for preview
+            </Typography>
+          )}
+        </Stack>
+      </DialogContent>
+    </Dialog>
+  );
+};
+
--- a/frontend/src/components/YouTubeCreator/components/VideoRenderQueue.tsx
+++ b/frontend/src/components/YouTubeCreator/components/VideoRenderQueue.tsx
@@ -0,0 +1,179 @@
+import React, { useMemo } from 'react';
+import { Box, Paper, Stack, Typography, Button, LinearProgress, Alert, Chip } from '@mui/material';
+import { PlayArrow, VideoLibrary, CheckCircle, ErrorOutline } from '@mui/icons-material';
+import { Scene, VideoPlan } from '../../../services/youtubeApi';
+import { useVideoRenderQueue, SceneVideoJob } from '../hooks/useVideoRenderQueue';
+
+interface VideoRenderQueueProps {
+  scenes: Scene[]; // one row per scene; only scenes with enabled !== false count toward the final video
+  videoPlan: VideoPlan | null; // passed to useVideoRenderQueue; its video_summary is handed to combineVideos
+  resolution: '480p' | '720p' | '1080p'; // passed to useVideoRenderQueue
+  onSceneVideoReady: (sceneNumber: number, videoUrl: string) => void; // passed to useVideoRenderQueue
+  onFinalVideoReady?: (videoUrl: string) => void; // passed to useVideoRenderQueue as onCombineReady
+}
+
+const statusColor = (job?: SceneVideoJob) => {
+  // Chip color per job state; a missing job or any other status stays neutral.
+  const status = job?.status;
+  if (status === 'completed') return 'success';
+  if (status === 'failed') return 'error';
+  return status === 'running' ? 'info' : 'default';
+};
+
+export const VideoRenderQueue: React.FC<VideoRenderQueueProps> = ({
+  scenes,
+  videoPlan,
+  resolution,
+  onSceneVideoReady,
+  onFinalVideoReady,
+}) => { // Per-scene render controls followed by the final "Combine Scenes" step
+  const {
+    jobs,
+    runSceneVideo,
+    combineVideos,
+    combineStatus,
+    combineProgress,
+  } = useVideoRenderQueue({
+    scenes,
+    videoPlan,
+    resolution,
+    onSceneVideoReady,
+    onCombineReady: onFinalVideoReady,
+  });
+  // True when every enabled scene (enabled !== false) has a job with a videoUrl; false when no scene is enabled
+  const allVideosReady = useMemo(() => {
+    const enabled = scenes.filter((s) => s.enabled !== false);
+    if (enabled.length === 0) return false;
+    return enabled.every((s) => jobs[s.scene_number]?.videoUrl);
+  }, [jobs, scenes]);
+  // NOTE(review): rejections from runSceneVideo/combineVideos are swallowed with .catch(() => {}) below; failures are shown only through job.error / combineStatus — confirm the hook always sets them
+  return (
+    <Paper sx={{ p: 3, mt: 2 }}>
+      <Typography variant="h6" sx={{ fontWeight: 700, mb: 2 }}>
+        Scene-wise Video Generation
+      </Typography>
+      <Typography variant="body2" color="text.secondary" sx={{ mb: 3 }}>
+        Generate videos per scene to save costs and retry only failing scenes. Once all scene videos are ready, combine them into a final video.
+      </Typography>
+
+      <Stack spacing={2}>
+        {scenes.map((scene) => {
+          const job = jobs[scene.scene_number]; // may be undefined; the chip then shows 'idle'
+          return (
+            <Paper key={scene.scene_number} variant="outlined" sx={{ p: 2 }}>
+              <Stack direction="row" justifyContent="space-between" alignItems="center" spacing={2} flexWrap="wrap">
+                <Box>
+                  <Typography variant="subtitle1" sx={{ fontWeight: 600 }}>
+                    Scene {scene.scene_number}: {scene.title}
+                  </Typography>
+                  <Typography variant="caption" color="text.secondary">
+                    {scene.imageUrl ? '✅ Image ready' : '⚠️ Image missing'} · {scene.audioUrl ? '✅ Audio ready' : '⚠️ Audio missing'}
+                  </Typography>
+                  {job?.error && (
+                    <Alert severity="error" sx={{ mt: 1 }}>
+                      {job.error}
+                    </Alert>
+                  )}
+                </Box>
+                <Stack direction="row" spacing={1} alignItems="center" flexWrap="wrap">
+                  <Chip
+                    label={job?.status ?? 'idle'}
+                    color={statusColor(job) as any}
+                    size="small"
+                    variant="outlined"
+                  />
+                  <Button
+                    variant="contained"
+                    size="small"
+                    startIcon={<PlayArrow />}
+                    disabled={job?.status === 'running'}
+                    onClick={() => runSceneVideo(scene, { generateAudio: false }).catch(() => {})}
+                  >
+                    {job?.status === 'running'
+                      ? 'Generating...'
+                      : job?.status === 'completed'
+                      ? 'Regenerate Video'
+                      : 'Generate Video'}
+                  </Button>
+                  {job?.videoUrl && (
+                    <Button
+                      variant="outlined"
+                      size="small"
+                      href={job.videoUrl}
+                      target="_blank"
+                      rel="noreferrer"
+                    >
+                      Preview
+                    </Button>
+                  )}
+                </Stack>
+              </Stack>
+              {job?.status === 'running' && (
+                <Box sx={{ mt: 1.5 }}>
+                  <LinearProgress variant="determinate" value={job.progress || 0} sx={{ height: 6, borderRadius: 2 }} />
+                  <Typography variant="caption" color="text.secondary">
+                    {Math.round(job.progress || 0)}%
+                  </Typography>
+                </Box>
+              )}
+            </Paper>
+          );
+        })}
+      </Stack>
+
+      <Box sx={{ mt: 3, p: 2, border: '1px solid #e5e7eb', borderRadius: 2 }}>
+        <Typography variant="subtitle1" sx={{ fontWeight: 600, mb: 1 }}>
+          Final Video
+        </Typography>
+        {!allVideosReady && (
+          <Alert severity="info" icon={<VideoLibrary />}>
+            Generate videos for all enabled scenes to combine them into a single final video.
+          </Alert>
+        )}
+        {allVideosReady && (
+          <Stack spacing={1}>
+            <Typography variant="body2" color="text.secondary">
+              All scene videos are ready. Combine into a final video.
+            </Typography>
+            {combineStatus === 'running' && (
+              <Box>
+                <LinearProgress
+                  variant="determinate"
+                  value={combineProgress || 0}
+                  sx={{ height: 6, borderRadius: 2, mb: 0.5 }}
+                />
+                <Typography variant="caption" color="text.secondary">
+                  {Math.round(combineProgress || 0)}%
+                </Typography>
+              </Box>
+            )}
+            <Stack direction="row" spacing={1} alignItems="center">
+              <Button
+                variant="contained"
+                color="secondary"
+                startIcon={<VideoLibrary />}
+                disabled={combineStatus === 'running'}
+                onClick={() =>
+                  combineVideos(
+                    scenes
+                      .filter((s) => s.enabled !== false)
+                      .map((s) => jobs[s.scene_number]?.videoUrl)
+                      .filter(Boolean) as string[],
+                    videoPlan?.video_summary
+                  ).catch(() => {})
+                }
+              >
+                {combineStatus === 'running' ? 'Combining...' : 'Combine Scenes'}
+              </Button>
+              {combineStatus === 'completed' && <Chip icon={<CheckCircle />} color="success" label="Final video ready" />}
+              {combineStatus === 'failed' && (
+                <Chip icon={<ErrorOutline />} color="error" label="Combine failed, retry" />
+              )}
+            </Stack>
+          </Stack>
+        )}
+      </Box>
+    </Paper>
+  );
+};
+
--- a/frontend/src/components/YouTubeCreator/constants.ts
+++ b/frontend/src/components/YouTubeCreator/constants.ts
@@ -7,7 +7,7 @@ export const YT_BG = '#f9f9f9';
 export const YT_BORDER = '#e5e5e5';
 export const YT_TEXT = '#0f0f0f';

-export const STEPS = ['Plan Your Video', 'Review Scenes', 'Render Video'] as const;
+export const STEPS = ['Plan Your Video', 'Review Scenes', 'Generate Assets', 'Render Video'] as const;

 export const RESOLUTIONS = ['480p', '720p', '1080p'] as const;
 export type Resolution = typeof RESOLUTIONS[number];
--- a/frontend/src/components/YouTubeCreator/hooks/useCostEstimate.ts
+++ b/frontend/src/components/YouTubeCreator/hooks/useCostEstimate.ts
@@ -11,24 +11,40 @@ interface UseCostEstimateParams {
  scenes: Scene[];
  resolution: Resolution;
  renderTaskId: string | null;
+  imageModel?: 'ideogram-v3-turbo' | 'qwen-image';
 }

-export const useCostEstimate = ({ activeStep, scenes, resolution, renderTaskId }: UseCostEstimateParams) => {
+export const useCostEstimate = ({ activeStep, scenes, resolution, renderTaskId, imageModel = 'ideogram-v3-turbo' }: UseCostEstimateParams) => {
  const [costEstimate, setCostEstimate] = useState<CostEstimate | null>(null);
  const [loadingCostEstimate, setLoadingCostEstimate] = useState(false);

  useEffect(() => {
-    if (activeStep === 2 && scenes.length > 0 && !renderTaskId) {
+    // Fetch cost estimate on both "Generate Assets" (step 2) and "Render Video" (step 3) steps
+    if ((activeStep === 2 || activeStep === 3) && scenes.length > 0 && !renderTaskId) {
      const fetchCostEstimate = async () => {
        setLoadingCostEstimate(true);
        try {
          const enabledScenes = scenes.filter(s => s.enabled !== false);
+          
+          // Only fetch if all enabled scenes have images and audio
+          const allScenesReady = enabledScenes.every(s => s.imageUrl && s.audioUrl);
+          
+          if (!allScenesReady && activeStep === 3) {
+            // On render step, require all scenes to be ready
+            setCostEstimate(null);
+            setLoadingCostEstimate(false);
+            return;
+          }
+          
          const response = await youtubeApi.estimateCost({
            scenes: enabledScenes,
            resolution: resolution,
+            imageModel: imageModel,
          });
          if (response.success && response.estimate) {
            setCostEstimate(response.estimate);
+          } else {
+            setCostEstimate(null);
          }
        } catch (err: any) {
          console.error('Error estimating cost:', err);
@@ -39,8 +55,12 @@ export const useCostEstimate = ({ activeStep, scenes, resolution, renderTaskId }
      };

      fetchCostEstimate();
+    } else {
+      // Reset cost estimate when not on relevant steps
+      setCostEstimate(null);
+      setLoadingCostEstimate(false);
    }
-  }, [activeStep, scenes, resolution, renderTaskId]);
+  }, [activeStep, scenes, resolution, renderTaskId, imageModel]);

  return { costEstimate, loadingCostEstimate };
 };
--- a/frontend/src/components/YouTubeCreator/hooks/useGenerationState.ts
+++ b/frontend/src/components/YouTubeCreator/hooks/useGenerationState.ts
@@ -0,0 +1,55 @@
+// Hook for managing generation state
+import { useState } from 'react';
+import { AudioGenerationSettings } from '../../shared/AudioSettingsModal';
+
+/** UI state for asset generation: settings dialogs, audio options, and progress tracking. */
+export const useGenerationState = () => {
+  // Open/closed flags for the audio and image settings dialogs
+  const [showAudioSettingsModal, setShowAudioSettingsModal] = useState(false);
+  const [showImageSettingsModal, setShowImageSettingsModal] = useState(false);
+
+  // Audio options in effect; they start from these defaults
+  const initialAudioSettings: AudioGenerationSettings = {
+    voiceId: "Casual_Guy",
+    speed: 1.15,
+    volume: 1.0,
+    pitch: 0.0,
+    emotion: "happy",
+    englishNormalization: true,
+    bitrate: 128000,
+    channel: "1",
+    format: "mp3",
+    enableSyncMode: true,
+  };
+  const [currentAudioSettings, setCurrentAudioSettings] = useState(initialAudioSettings);
+
+  // Progress value and status text, kept separately for image and audio
+  const [imageGenerationProgress, setImageGenerationProgress] = useState(0);
+  const [imageGenerationStatus, setImageGenerationStatus] = useState('');
+  const [audioGenerationProgress, setAudioGenerationProgress] = useState(0);
+  const [audioGenerationStatus, setAudioGenerationStatus] = useState('');
+
+  /** Clears the image status text and puts its progress back to 0. */
+  function resetImageGeneration() {
+    setImageGenerationStatus('');
+    setImageGenerationProgress(0);
+  }
+
+  /** Clears the audio status text and puts its progress back to 0. */
+  function resetAudioGeneration() {
+    setAudioGenerationStatus('');
+    setAudioGenerationProgress(0);
+  }
+
+  // Each value is returned next to its setter, followed by the two reset helpers
+  return {
+    showAudioSettingsModal, setShowAudioSettingsModal,
+    showImageSettingsModal, setShowImageSettingsModal,
+    currentAudioSettings, setCurrentAudioSettings,
+    imageGenerationProgress, setImageGenerationProgress,
+    imageGenerationStatus, setImageGenerationStatus,
+    audioGenerationProgress, setAudioGenerationProgress,
+    audioGenerationStatus, setAudioGenerationStatus,
+    resetImageGeneration, resetAudioGeneration,
+  };
+};
--- a/frontend/src/components/YouTubeCreator/hooks/useImageGenerationPolling.ts
+++ b/frontend/src/components/YouTubeCreator/hooks/useImageGenerationPolling.ts
@@ -0,0 +1,188 @@
+/**
+ * Custom hook for robust image generation polling
+ * 
+ * Handles:
+ * - Proper cleanup on unmount
+ * - Retry logic with exponential backoff
+ * - Timeout handling
+ * - Error classification and handling
+ * - Race condition prevention
+ */
+
+import { useRef, useCallback, useEffect } from 'react';
+
+interface PollingOptions { // Arguments accepted by startPolling for one image-generation task.
+  taskId: string; // Passed to getStatus on every poll; also the key used to track and stop this poller.
+  sceneNumber: number; // Only used to prefix error messages ("Scene N: ...").
+  onComplete: (imageUrl: string) => void; // Called with status.result.image_url once the task completes.
+  onError: (error: string) => void; // Called with a message on failure, timeout, lost task or exhausted retries.
+  onProgress?: (progress: number, message: string) => void; // Called while the task reports 'processing'.
+  pollInterval?: number; // Milliseconds between polls; startPolling defaults this to 3000.
+  maxPollTime?: number; // Milliseconds before polling gives up; startPolling defaults this to 5 minutes.
+  maxRetries?: number; // Backoff retries allowed for network/server errors; startPolling defaults this to 3.
+  getStatus: (taskId: string) => Promise<any>; // Status fetcher; status/result/error/message/progress are read from its result.
+}
+
+/**
+ * Polls image-generation tasks until each completes, fails or times out. Returns `startPolling`
+ * (starts one poller and returns its cancel function) and `stopPolling` (cancels by task id).
+ */
+export const useImageGenerationPolling = () => {
+  // taskId -> cleanup function of the poller currently registered for that task.
+  const activePollingRef = useRef<Map<string, () => void>>(new Map());
+
+  const startPolling = useCallback((options: PollingOptions) => {
+    const {
+      taskId, sceneNumber, getStatus,
+      onComplete, onError, onProgress,
+      pollInterval = 3000,
+      maxPollTime = 5 * 60 * 1000, // 5 minutes
+      maxRetries = 3,
+    } = options;
+
+    // If already polling this task, stop it first
+    activePollingRef.current.get(taskId)?.();
+
+    const pollIntervalRef = { current: null as NodeJS.Timeout | null };
+    const timeoutRef = { current: null as NodeJS.Timeout | null };
+    const backoffRef = { current: null as NodeJS.Timeout | null }; // pending retry timer
+    const retryCountRef = { current: 0 };
+    const startTime = Date.now();
+    let isActive = true;
+    let requestInFlight = false; // true while a getStatus call is pending
+
+    // Derived from maxPollTime so the message stays correct for non-default limits.
+    const timeoutMinutes = Math.round(maxPollTime / 6000) / 10;
+    const timeoutMessage =
+      `Scene ${sceneNumber}: Image generation timed out after ${timeoutMinutes} ` +
+      `${timeoutMinutes === 1 ? 'minute' : 'minutes'}. Please try again.`;
+
+    const cleanup = () => {
+      isActive = false;
+      if (pollIntervalRef.current) clearInterval(pollIntervalRef.current);
+      if (timeoutRef.current) clearTimeout(timeoutRef.current);
+      if (backoffRef.current) clearTimeout(backoffRef.current);
+      pollIntervalRef.current = timeoutRef.current = backoffRef.current = null;
+      // Unregister only our own entry; a newer poller for the same task may own the slot.
+      if (activePollingRef.current.get(taskId) === cleanup) {
+        activePollingRef.current.delete(taskId);
+      }
+    };
+
+    const pollForStatus = async () => {
+      // Skip ticks that fire after cleanup or while the previous request is still pending.
+      if (!isActive || requestInFlight) return;
+
+      // Check if we've exceeded max poll time
+      if (Date.now() - startTime > maxPollTime) {
+        cleanup();
+        onError(timeoutMessage);
+        return;
+      }
+
+      requestInFlight = true;
+      try {
+        const status = await getStatus(taskId);
+        retryCountRef.current = 0; // Reset retry count on success
+
+        if (!isActive) return;
+
+        if (status.status === 'completed' && status.result) {
+          cleanup();
+          onComplete(status.result.image_url);
+        } else if (status.status === 'failed') {
+          cleanup();
+          const errorMsg = status.error || status.message || 'Image generation failed';
+          onError(`Scene ${sceneNumber}: ${errorMsg}`);
+        } else if (status.status === 'processing') {
+          onProgress?.(status.progress || 0, status.message || 'Processing...');
+        }
+        // Any other state: keep polling.
+      } catch (pollError: any) {
+        if (!isActive) return;
+
+        // Classify error type
+        const isNetworkError = pollError.code === 'ECONNABORTED' ||
+                               pollError.message?.includes('timeout') ||
+                               pollError.message?.includes('Network');
+        const isNotFoundError = pollError.response?.status === 404 ||
+                                pollError.message?.includes('404') ||
+                                pollError.message?.includes('not found');
+        const isServerError = pollError.response?.status >= 500;
+
+        if (isNotFoundError) {
+          // Task not found - stop polling immediately
+          cleanup();
+          onError(`Scene ${sceneNumber}: Image generation task was lost. Please try again.`);
+          return;
+        }
+
+        // For network/server errors, retry with exponential backoff
+        if ((isNetworkError || isServerError) && retryCountRef.current < maxRetries) {
+          retryCountRef.current += 1;
+          const backoffDelay = Math.min(pollInterval * Math.pow(2, retryCountRef.current), 30000); // Max 30s
+          console.warn(
+            `[ImagePolling] Retrying poll for task ${taskId} ` +
+            `(${retryCountRef.current}/${maxRetries}) after ${backoffDelay}ms`
+          );
+
+          // Pause the regular interval; it is re-armed when the backoff timer fires.
+          if (pollIntervalRef.current) {
+            clearInterval(pollIntervalRef.current);
+            pollIntervalRef.current = null;
+          }
+          if (backoffRef.current) clearTimeout(backoffRef.current);
+          backoffRef.current = setTimeout(() => {
+            backoffRef.current = null;
+            if (isActive && !pollIntervalRef.current) {
+              pollForStatus(); // Retry immediately
+              pollIntervalRef.current = setInterval(pollForStatus, pollInterval);
+            }
+          }, backoffDelay);
+        } else if (retryCountRef.current >= maxRetries) {
+          // Max retries exceeded
+          cleanup();
+          onError(
+            `Scene ${sceneNumber}: Failed to check image generation status after ${maxRetries} retries. ` +
+            `Please refresh and try again.`
+          );
+        }
+        // For other errors, continue polling (might be transient)
+      } finally {
+        requestInFlight = false;
+      }
+    };
+
+    // Start polling immediately, then every pollInterval
+    pollForStatus();
+    pollIntervalRef.current = setInterval(pollForStatus, pollInterval);
+
+    // Set a timeout to stop polling after max time
+    timeoutRef.current = setTimeout(() => {
+      if (isActive) {
+        cleanup();
+        onError(timeoutMessage);
+      }
+    }, maxPollTime);
+
+    // Store cleanup function
+    activePollingRef.current.set(taskId, cleanup);
+
+    return cleanup;
+  }, []);
+
+  // Cleanup all polling on unmount
+  useEffect(() => {
+    return () => {
+      activePollingRef.current.forEach((cleanup) => cleanup());
+      activePollingRef.current.clear();
+    };
+  }, []);
+
+  const stopPolling = useCallback((taskId: string) => {
+    activePollingRef.current.get(taskId)?.();
+  }, []);
+
+  return { startPolling, stopPolling };
+};
+
--- a/frontend/src/components/YouTubeCreator/hooks/useSceneMedia.ts
+++ b/frontend/src/components/YouTubeCreator/hooks/useSceneMedia.ts
@@ -0,0 +1,80 @@
+// Hook for managing scene media (images and audio)
+import { useState, useEffect } from 'react';
+import { fetchMediaBlobUrl } from '../../../utils/fetchMediaBlobUrl';
+
+interface UseSceneMediaProps { // Inputs to useSceneMedia; a null/undefined URL clears the matching blob URL.
+  imageUrl?: string | null; // Source URL passed to fetchMediaBlobUrl for the scene image.
+  audioUrl?: string | null; // Source URL passed to fetchMediaBlobUrl for the scene audio.
+}
+
+// Releases `url` only when it is an object URL; direct-URL fallbacks are left untouched.
+const revokeIfBlob = (url: string | null): void => {
+  if (url?.startsWith('blob:')) URL.revokeObjectURL(url);
+};
+
+/** Loads a scene's image and audio through fetchMediaBlobUrl; returns each URL and loading flag. */
+export const useSceneMedia = ({ imageUrl, audioUrl }: UseSceneMediaProps) => {
+  const [imageBlobUrl, setImageBlobUrl] = useState<string | null>(null);
+  const [imageLoading, setImageLoading] = useState(false);
+  const [audioBlobUrl, setAudioBlobUrl] = useState<string | null>(null);
+  const [audioLoading, setAudioLoading] = useState(false);
+
+  useEffect(() => {
+    console.log('[useSceneMedia] Image URL changed:', imageUrl);
+    if (!imageUrl) {
+      console.log('[useSceneMedia] No imageUrl, clearing blob');
+      setImageBlobUrl(null);
+      setImageLoading(false); // a fetch cancelled by this change no longer clears the flag
+      return;
+    }
+    let cancelled = false; // set by cleanup so a stale fetch cannot overwrite newer state
+    let ownedUrl: string | null = null; // URL produced by this run, released by its cleanup
+    void (async () => {
+      setImageLoading(true);
+      console.log('[useSceneMedia] Starting to fetch image blob for:', imageUrl);
+      try {
+        const blobUrl = await fetchMediaBlobUrl(imageUrl);
+        if (cancelled) return revokeIfBlob(blobUrl); // arrived too late; release it
+        if (blobUrl) {
+          console.log('[useSceneMedia] Image blob loaded:', blobUrl);
+          ownedUrl = blobUrl;
+          setImageBlobUrl(blobUrl);
+          return;
+        }
+        // Fallback: use direct URL if blob could not be created (e.g., 404/401 handled upstream)
+        console.warn('[useSceneMedia] Blob URL unavailable, falling back to direct imageUrl');
+        setImageBlobUrl(imageUrl);
+      } catch (error) {
+        if (cancelled) return;
+        console.error('[useSceneMedia] Failed to load image:', error);
+        // Fallback to direct URL so UI still shows something while we investigate auth/serving
+        setImageBlobUrl(imageUrl);
+      } finally {
+        if (!cancelled) setImageLoading(false);
+      }
+    })();
+    return () => { cancelled = true; revokeIfBlob(ownedUrl); };
+  }, [imageUrl]);
+
+  useEffect(() => {
+    if (!audioUrl) {
+      setAudioBlobUrl(null);
+      setAudioLoading(false);
+      return;
+    }
+    let cancelled = false;
+    let ownedUrl: string | null = null;
+    setAudioLoading(true);
+    fetchMediaBlobUrl(audioUrl)
+      .then((blobUrl) => {
+        if (cancelled) return revokeIfBlob(blobUrl);
+        ownedUrl = blobUrl;
+        setAudioBlobUrl(blobUrl);
+      })
+      .catch(console.error)
+      .finally(() => { if (!cancelled) setAudioLoading(false); });
+    return () => { cancelled = true; revokeIfBlob(ownedUrl); }; // this run's URL, not stale state
+  }, [audioUrl]);
+
+  return { imageBlobUrl, imageLoading, audioBlobUrl, audioLoading };
+};
--- a/frontend/src/components/YouTubeCreator/hooks/useVideoRenderQueue.ts
+++ b/frontend/src/components/YouTubeCreator/hooks/useVideoRenderQueue.ts
@@ -0,0 +1,232 @@
+import { useCallback, useEffect, useRef, useState } from 'react';
+import { youtubeApi, Scene, VideoPlan, TaskStatus } from '../../../services/youtubeApi';
+
+export type VideoJobStatus = 'idle' | 'running' | 'completed' | 'failed'; // States shared by scene jobs and the combine task.
+
+export interface SceneVideoJob { // Render state tracked for one scene, stored in `jobs` under its scene number.
+  scene_number: number; // Copied from Scene.scene_number.
+  status: VideoJobStatus;
+  progress: number; // Last reported progress; set to 100 on completion and 0 on failure.
+  taskId?: string; // task_id returned when the scene render was started.
+  videoUrl?: string; // Taken from Scene.videoUrl or from the completed task's result.
+  error?: string; // Failure message recorded when the job fails.
+}
+
+interface UseVideoRenderQueueOptions { // Inputs to useVideoRenderQueue.
+  scenes: Scene[]; // Jobs are seeded from these; a scene that already has videoUrl starts as 'completed'.
+  videoPlan: VideoPlan | null; // Sent as video_plan with each scene render; runSceneVideo throws while it is null.
+  resolution: '480p' | '720p' | '1080p'; // Forwarded to both the scene render and the combine request.
+  onSceneVideoReady?: (sceneNumber: number, videoUrl: string) => void; // Called when a scene task completes with a video URL.
+  onCombineReady?: (videoUrl: string) => void; // Called when the combine task completes with a video URL.
+}
+
+/**
+ * Tracks per-scene video render jobs and the final combine task, calling
+ * `youtubeApi.getRenderStatus` every 3 seconds for each started task until it completes or fails.
+ *
+ * @returns the job map keyed by scene number, the `runSceneVideo` / `combineVideos` starters
+ *   (both reject when the task cannot be started), and the combine task id, progress and
+ *   status. All pollers are stopped when the component unmounts.
+ */
+export const useVideoRenderQueue = ({
+  scenes, videoPlan, resolution, onSceneVideoReady, onCombineReady,
+}: UseVideoRenderQueueOptions) => {
+  const [jobs, setJobs] = useState<Record<number, SceneVideoJob>>({});
+  const [combineTaskId, setCombineTaskId] = useState<string | null>(null);
+  const [combineProgress, setCombineProgress] = useState<number>(0);
+  const [combineStatus, setCombineStatus] = useState<VideoJobStatus>('idle');
+  const pollingRef = useRef<Map<string, NodeJS.Timeout>>(new Map()); // taskId -> interval handle
+
+  // Initialize jobs for current scenes
+  useEffect(() => {
+    setJobs((prev) => {
+      const next = { ...prev };
+      scenes.forEach((scene) => {
+        const sn = scene.scene_number;
+        if (!next[sn]) {
+          next[sn] = {
+            scene_number: sn,
+            status: scene.videoUrl ? 'completed' : 'idle',
+            progress: scene.videoUrl ? 100 : 0,
+            videoUrl: scene.videoUrl,
+          };
+        } else if (scene.videoUrl && next[sn].videoUrl !== scene.videoUrl) {
+          next[sn] = { ...next[sn], videoUrl: scene.videoUrl, status: 'completed', progress: 100 };
+        }
+      });
+      return next;
+    });
+  }, [scenes]);
+
+  // Stops and forgets the poller registered for `taskId`, if any.
+  const stopPolling = useCallback((taskId: string) => {
+    const timer = pollingRef.current.get(taskId);
+    if (timer) {
+      clearInterval(timer);
+      pollingRef.current.delete(taskId);
+    }
+  }, []);
+
+  const pollTask = useCallback(
+    (taskId: string, sceneNumber?: number, isCombine?: boolean) => {
+      // Merges `patch` into scene `sn`'s job, creating the entry when it does not exist yet.
+      const patchJob = (sn: number, patch: Pick<SceneVideoJob, 'status' | 'progress'> & Partial<SceneVideoJob>) =>
+        setJobs((prev) => ({ ...prev, [sn]: { ...(prev[sn] || { scene_number: sn }), ...patch } }));
+
+      // Never leave an earlier interval for the same task running.
+      stopPolling(taskId);
+      const timer: NodeJS.Timeout = setInterval(async () => {
+        // A response is stale once this poller was stopped or replaced (also true after unmount).
+        const isStale = () => pollingRef.current.get(taskId) !== timer;
+        try {
+          const status: TaskStatus = await youtubeApi.getRenderStatus(taskId);
+          if (isStale()) return;
+          const progress = status.progress ?? 0;
+
+          // Mirror what the task reports on every tick.
+          if (isCombine) {
+            setCombineProgress(progress);
+          } else if (sceneNumber !== undefined) {
+            patchJob(sceneNumber, {
+              status: status.status === 'failed' ? 'failed' : status.status === 'completed' ? 'completed' : 'running',
+              progress,
+            });
+          }
+
+          if (status.status === 'completed') {
+            stopPolling(taskId);
+            const result = status.result || {};
+
+            if (isCombine) {
+              const finalUrl = result.final_video_url || result.video_url;
+              if (finalUrl && onCombineReady) {
+                onCombineReady(finalUrl);
+              }
+              setCombineStatus('completed');
+            } else if (sceneNumber !== undefined) {
+              const videoUrl =
+                result.final_video_url ||
+                result.video_url ||
+                (Array.isArray(result.scene_results) && result.scene_results[0]?.video_url);
+              if (videoUrl && onSceneVideoReady) {
+                onSceneVideoReady(sceneNumber, videoUrl);
+              }
+              patchJob(sceneNumber, { status: 'completed', progress: 100, videoUrl });
+            }
+          } else if (status.status === 'failed') {
+            stopPolling(taskId);
+            const errorMsg = status.error || status.message || 'Video render failed';
+            if (isCombine) {
+              setCombineStatus('failed');
+            } else if (sceneNumber !== undefined) {
+              patchJob(sceneNumber, { status: 'failed', progress: 0, error: errorMsg });
+            }
+          }
+        } catch (err) {
+          if (isStale()) return;
+          stopPolling(taskId);
+          const errorMsg = err instanceof Error ? err.message : 'Video render failed';
+          if (sceneNumber !== undefined) {
+            patchJob(sceneNumber, { status: 'failed', progress: 0, error: errorMsg });
+          } else {
+            setCombineStatus('failed');
+          }
+        }
+      }, 3000);
+
+      pollingRef.current.set(taskId, timer);
+    },
+    [onCombineReady, onSceneVideoReady, stopPolling]
+  );
+
+  // Starts the render for one scene and begins polling the returned task.
+  const runSceneVideo = useCallback(
+    async (scene: Scene, opts?: { generateAudio?: boolean }) => {
+      if (!videoPlan) {
+        throw new Error('Video plan is missing');
+      }
+      if (!scene.imageUrl) throw new Error('Scene image is required before video generation.');
+      if (!scene.audioUrl && !opts?.generateAudio) throw new Error('Scene audio is required before video generation.');
+
+      const sn = scene.scene_number;
+      setJobs((prev) => ({
+        ...prev,
+        [sn]: { scene_number: sn, status: 'running', progress: 5 },
+      }));
+
+      try {
+        const resp = await youtubeApi.generateSceneVideo({
+          scene,
+          video_plan: videoPlan,
+          resolution,
+          generate_audio_enabled: Boolean(opts?.generateAudio),
+        });
+        if (!resp.success || !resp.task_id) {
+          throw new Error(resp.message || 'Failed to start scene video render');
+        }
+        const taskId = resp.task_id;
+        setJobs((prev) => ({
+          ...prev,
+          [sn]: { ...(prev[sn] || { scene_number: sn }), status: 'running', taskId, progress: 5 },
+        }));
+        pollTask(taskId, sn, false);
+      } catch (err) {
+        // Covers both a rejected request and an unsuccessful response: without this the job
+        // would stay 'running' forever because no poller was started for it.
+        const error = err instanceof Error ? err.message : 'Failed to start scene video render';
+        setJobs((prev) => ({
+          ...prev,
+          [sn]: { scene_number: sn, status: 'failed', progress: 0, error },
+        }));
+        throw err;
+      }
+    },
+    [videoPlan, resolution, pollTask]
+  );
+
+  // Starts the combine task for the given scene videos and begins polling it.
+  const combineVideos = useCallback(
+    async (videoUrls: string[], title?: string) => {
+      if (!videoUrls || videoUrls.length < 2) {
+        throw new Error('At least two scene videos are required to combine.');
+      }
+      setCombineStatus('running');
+      setCombineProgress(5);
+      try {
+        const resp = await youtubeApi.combineVideos({
+          scene_video_urls: videoUrls,
+          resolution,
+          title,
+        });
+        if (!resp.success || !resp.task_id) {
+          throw new Error(resp.message || 'Failed to start combine task');
+        }
+        setCombineTaskId(resp.task_id);
+        setCombineProgress(10);
+        pollTask(resp.task_id, undefined, true);
+      } catch (err) {
+        setCombineStatus('failed'); // also reached when the request itself rejects
+        throw err;
+      }
+    },
+    [pollTask, resolution]
+  );
+
+  // Cleanup polling on unmount
+  useEffect(() => {
+    return () => {
+      pollingRef.current.forEach((timer) => clearInterval(timer));
+      pollingRef.current.clear();
+    };
+  }, []);
+
+  return {
+    jobs,
+    runSceneVideo,
+    combineVideos,
+    combineTaskId,
+    combineProgress,
+    combineStatus,
+  };
+};
+
--- a/frontend/src/components/YouTubeCreator/hooks/useYouTubeRenderQueue.ts
+++ b/frontend/src/components/YouTubeCreator/hooks/useYouTubeRenderQueue.ts
@@ -0,0 +1,268 @@
+import { useCallback, useEffect, useRef, useState } from 'react';
+import {
+  youtubeApi,
+  Scene,
+  SceneVideoRenderRequest,
+  TaskStatus,
+  VideoPlan,
+} from '../../../services/youtubeApi';
+
+type SceneStatus = 'idle' | 'running' | 'completed' | 'failed'; // Lifecycle of one scene's video render.
+
+interface SceneVideoState { // Render state kept per scene number in `sceneStatuses`.
+  status: SceneStatus;
+  progress: number; // Last reported progress; set to 100 on completion.
+  taskId?: string; // task_id of the render task started for the scene.
+  error?: string; // Message recorded when starting or polling the render fails.
+  videoUrl?: string; // Video URL taken from the completed task's result, when present.
+}
+
+interface UseYouTubeRenderQueueParams { // Inputs to useYouTubeRenderQueue.
+  scenes: Scene[]; // Scenes to update with video URLs; enabled ones that have a videoUrl are combined.
+  videoPlan: VideoPlan | null; // Required by runSceneVideo; forwarded to the combine request when non-null.
+  resolution: '480p' | '720p' | '1080p'; // Sent with scene render and combine requests.
+  onScenesUpdate: (updated: Scene[]) => void; // Receives the scene list with a finished scene's videoUrl set.
+  onError?: (msg: string) => void; // Receives failure messages; the hook reports errors here instead of throwing.
+  onInfo?: (msg: string) => void; // Never called within this hook.
+  onSuccess?: (msg: string) => void; // Called once the final video has been combined.
+}
+
+interface UseYouTubeRenderQueueResult { // Values returned by useYouTubeRenderQueue.
+  sceneStatuses: Record<number, SceneVideoState>; // Keyed by scene_number.
+  finalVideoUrl: string | null; // URL from the completed combine task, or null.
+  combining: boolean; // True while a combine task is being started or polled.
+  combiningProgress: number; // Progress last reported for the combine task.
+  combiningMessage: string; // Status text for the combine step, or its failure message.
+  runSceneVideo: (scene: Scene) => Promise<void>; // Starts rendering one scene and polls its task.
+  combineVideos: () => Promise<void>; // Combines the ready scene videos; resolves once the task settles.
+}
+
+const POLL_MS = 3000; // Delay between render-status polls, in milliseconds.
+
+/**
+ * Drives per-scene video rendering and the final combine step. Scene tasks are polled every
+ * POLL_MS; each finished scene URL is merged into the scene list and sent to `onScenesUpdate`.
+ * Failures are reported through `onError` rather than thrown. `onInfo` is accepted but unused.
+ */
+export function useYouTubeRenderQueue({
+  scenes, videoPlan, resolution, onScenesUpdate, onError, onSuccess,
+}: UseYouTubeRenderQueueParams): UseYouTubeRenderQueueResult {
+  const [sceneStatuses, setSceneStatuses] = useState<Record<number, SceneVideoState>>({});
+  const [finalVideoUrl, setFinalVideoUrl] = useState<string | null>(null);
+  const [combining, setCombining] = useState(false);
+  const [combiningProgress, setCombiningProgress] = useState(0);
+  const [combiningMessage, setCombiningMessage] = useState('Combining videos...');
+  const pollingRefs = useRef<Map<string, NodeJS.Timeout>>(new Map());
+  // Latest scenes for interval callbacks, which would otherwise read a stale closure copy.
+  const scenesRef = useRef(scenes);
+  const mountedRef = useRef(true); // cleared on unmount to end the combine polling loop
+
+  useEffect(() => {
+    scenesRef.current = scenes;
+  }, [scenes]);
+
+  // Merges `updates` into one scene's state, defaulting status to 'idle' and progress to 0.
+  const updateSceneStatus = useCallback((sceneNumber: number, updates: Partial<SceneVideoState>) => {
+    setSceneStatuses((prev) => ({
+      ...prev,
+      [sceneNumber]: {
+        ...prev[sceneNumber],
+        status: prev[sceneNumber]?.status || 'idle',
+        progress: prev[sceneNumber]?.progress || 0,
+        ...updates,
+      },
+    }));
+  }, []);
+
+  // Stops and forgets the poller registered for `taskId`, if any.
+  const clearPolling = useCallback((taskId: string) => {
+    const interval = pollingRefs.current.get(taskId);
+    if (interval !== undefined) clearInterval(interval);
+    pollingRefs.current.delete(taskId);
+  }, []);
+
+  useEffect(() => {
+    mountedRef.current = true;
+    return () => {
+      mountedRef.current = false;
+      pollingRefs.current.forEach((interval) => clearInterval(interval));
+      pollingRefs.current.clear();
+    };
+  }, []);
+
+  const pollTask = useCallback(
+    (taskId: string, sceneNumber: number) => {
+      clearPolling(taskId); // replace, rather than duplicate, an existing poller for this task
+      const interval: NodeJS.Timeout = setInterval(async () => {
+        // A response is stale once this poller has been cleared (finished, replaced, unmounted).
+        const isStale = () => pollingRefs.current.get(taskId) !== interval;
+        try {
+          const status: TaskStatus = await youtubeApi.getRenderStatus(taskId);
+          if (isStale()) return;
+          const progress = status.progress ?? 0;
+
+          if (status.status === 'completed') {
+            const videoUrl =
+              status.result?.video_url ||
+              status.result?.final_video_url ||
+              status.result?.scene_results?.[0]?.video_url ||
+              null;
+
+            updateSceneStatus(sceneNumber, {
+              status: 'completed',
+              progress: 100,
+              videoUrl: videoUrl || undefined,
+              taskId,
+              error: undefined,
+            });
+
+            if (videoUrl) {
+              // Build from the latest scenes so URLs delivered by other tasks are kept.
+              const updatedScenes = scenesRef.current.map((s) =>
+                s.scene_number === sceneNumber ? { ...s, videoUrl } : s
+              );
+              scenesRef.current = updatedScenes;
+              onScenesUpdate(updatedScenes);
+            }
+
+            clearPolling(taskId);
+          } else if (status.status === 'failed') {
+            const errorMessage =
+              status.error || status.message || status.result?.error || 'Video generation failed';
+            updateSceneStatus(sceneNumber, { status: 'failed', progress, error: errorMessage, taskId });
+            clearPolling(taskId);
+            onError?.(errorMessage);
+          } else {
+            updateSceneStatus(sceneNumber, { status: 'running', progress, taskId });
+          }
+        } catch (err: any) {
+          if (isStale()) return;
+          const msg = err?.message || 'Failed to poll render status';
+          updateSceneStatus(sceneNumber, { status: 'failed', progress: 0, error: msg, taskId });
+          clearPolling(taskId);
+          onError?.(msg);
+        }
+      }, POLL_MS);
+
+      pollingRefs.current.set(taskId, interval);
+    },
+    [clearPolling, onError, onScenesUpdate, updateSceneStatus]
+  );
+
+  // Starts the render for one scene (unless it is already running) and polls the new task.
+  const runSceneVideo = useCallback(
+    async (scene: Scene) => {
+      if (!videoPlan) {
+        onError?.('Video plan is missing');
+        return;
+      }
+      const sn = scene.scene_number;
+      const existing = sceneStatuses[sn];
+      if (existing?.status === 'running') return;
+
+      updateSceneStatus(sn, { status: 'running', progress: 5, error: undefined });
+
+      const payload: SceneVideoRenderRequest = {
+        scene,
+        video_plan: videoPlan,
+        resolution,
+        generate_audio_enabled: false,
+        voice_id: 'Wise_Woman',
+      };
+
+      try {
+        const resp = await youtubeApi.generateSceneVideo(payload);
+        if (resp.success && resp.task_id) {
+          updateSceneStatus(sn, { status: 'running', progress: 5, taskId: resp.task_id });
+          pollTask(resp.task_id, sn);
+        } else {
+          const msg = resp.message || 'Failed to start scene render';
+          updateSceneStatus(sn, { status: 'failed', progress: 0, error: msg });
+          onError?.(msg);
+        }
+      } catch (err: any) {
+        const msg = err?.message || 'Failed to start scene render';
+        updateSceneStatus(sn, { status: 'failed', progress: 0, error: msg });
+        onError?.(msg);
+      }
+    },
+    [pollTask, resolution, sceneStatuses, updateSceneStatus, videoPlan, onError]
+  );
+
+  // Combines every enabled scene that has a video URL and polls the task until it settles.
+  const combineVideos = useCallback(async () => {
+    const readyVideos = scenes
+      .filter((s) => s.enabled !== false && s.videoUrl)
+      .map((s) => s.videoUrl as string);
+
+    if (readyVideos.length < 2) {
+      onError?.('Need at least two scene videos to combine.');
+      return;
+    }
+
+    setCombining(true);
+    setCombiningProgress(5);
+    setCombiningMessage('Starting combination...');
+
+    try {
+      const resp = await youtubeApi.combineVideos({
+        scene_video_urls: readyVideos,
+        video_plan: videoPlan || undefined,
+        resolution,
+      });
+
+      if (!resp.success || !resp.task_id) {
+        const msg = resp.message || 'Failed to start video combine';
+        setCombining(false);
+        setCombiningProgress(0);
+        setCombiningMessage(msg);
+        onError?.(msg);
+        return;
+      }
+
+      const taskId = resp.task_id;
+      // Poll until the task settles; the loop also ends once the component has unmounted.
+      while (mountedRef.current) {
+        await new Promise((r) => setTimeout(r, POLL_MS));
+        const status = await youtubeApi.getRenderStatus(taskId);
+        if (!mountedRef.current) return;
+        setCombiningProgress(status.progress ?? 0);
+        setCombiningMessage(status.message || 'Combining...');
+
+        if (status.status === 'completed') {
+          const url = status.result?.video_url || status.result?.final_video_url;
+          setFinalVideoUrl(url || null);
+          setCombining(false);
+          setCombiningProgress(100);
+          setCombiningMessage('Combined successfully');
+          onSuccess?.('Final video combined successfully');
+          return;
+        }
+        if (status.status === 'failed') {
+          const msg = status.error || status.message || 'Combine failed';
+          setCombining(false);
+          setCombiningMessage(msg);
+          onError?.(msg);
+          return;
+        }
+      }
+    } catch (err: any) {
+      if (!mountedRef.current) return;
+      const msg = err?.message || 'Combine failed';
+      setCombining(false);
+      setCombiningMessage(msg);
+      onError?.(msg);
+    }
+  }, [onError, onSuccess, resolution, scenes, videoPlan]);
+
+  return {
+    sceneStatuses,
+    finalVideoUrl,
+    combining,
+    combiningProgress,
+    combiningMessage,
+    runSceneVideo,
+    combineVideos,
+  };
+}
+
--- a/frontend/src/components/YouTubeCreator/shared/YouTubeImageGenerationModal.tsx
+++ b/frontend/src/components/YouTubeCreator/shared/YouTubeImageGenerationModal.tsx
@@ -0,0 +1,687 @@
+import React, { useState, useEffect } from "react";
+import {
+  Dialog,
+  DialogTitle,
+  DialogContent,
+  DialogActions,
+  Stack,
+  Box,
+  Typography,
+  TextField,
+  Select,
+  MenuItem,
+  FormControl,
+  InputLabel,
+  Divider,
+  alpha,
+  Tooltip,
+  IconButton,
+  Paper,
+} from "@mui/material";
+import {
+  Info as InfoIcon,
+  HelpOutline as HelpOutlineIcon,
+  Close as CloseIcon,
+  Palette as PaletteIcon,
+} from "@mui/icons-material";
+
+/** Identifiers of the built-in presets offered by the image generation modal. */
+type PresetKey = "engagingHost" | "cinematicScene" | "professionalPresenter" | "casualCreator";
+
+/** One preset: its display text plus the prompt and generation options it applies. */
+type ScenePresetDefinition = {
+  title: string;
+  subtitle: string;
+  prompt: string;
+  style: "Auto" | "Fiction" | "Realistic";
+  renderingSpeed: "Default" | "Turbo" | "Quality";
+  aspectRatio: "1:1" | "16:9" | "9:16" | "4:3" | "3:4";
+};
+
+// Generation options that every preset below has in common.
+const SHARED_PRESET_OPTIONS = {
+  style: "Realistic",
+  renderingSpeed: "Quality",
+  aspectRatio: "16:9",
+} as const;
+
+const PRESETS: Record<PresetKey, ScenePresetDefinition> = {
+  engagingHost: {
+    title: "Engaging Host",
+    subtitle: "Dynamic presenter in engaging video environment",
+    prompt:
+      "Professional video host in modern studio, dynamic lighting, engaging facial expression, high energy atmosphere, camera-ready appearance, confident posture, vibrant background elements",
+    ...SHARED_PRESET_OPTIONS,
+  },
+  cinematicScene: {
+    title: "Cinematic Scene",
+    subtitle: "Dramatic, movie-like atmosphere with cinematic lighting",
+    prompt:
+      "Cinematic video scene, dramatic lighting, professional cinematography, engaging narrative atmosphere, high production value, cinematic depth of field, compelling visual storytelling",
+    ...SHARED_PRESET_OPTIONS,
+  },
+  professionalPresenter: {
+    title: "Professional Presenter",
+    subtitle: "Corporate-style presentation with clean, polished look",
+    prompt:
+      "Professional corporate presenter, clean business attire, polished appearance, neutral background, professional lighting, trustworthy demeanor, business presentation setting",
+    ...SHARED_PRESET_OPTIONS,
+  },
+  casualCreator: {
+    title: "Casual Creator",
+    subtitle: "Relaxed, approachable creator for vlogs and tutorials",
+    prompt:
+      "Casual content creator, friendly and approachable, comfortable setting, natural lighting, relaxed posture, authentic personality, everyday environment, genuine smile",
+    ...SHARED_PRESET_OPTIONS,
+  },
+};
+
+export interface YouTubeImageGenerationSettings { // Settings object the modal passes to onGenerate.
+  prompt: string; // Prompt text as currently held in the modal's state.
+  style: "Auto" | "Fiction" | "Realistic";
+  renderingSpeed: "Default" | "Turbo" | "Quality";
+  aspectRatio: "1:1" | "16:9" | "9:16" | "4:3" | "3:4";
+  model: "ideogram-v3-turbo" | "qwen-image"; // Image model identifier held in the modal's state.
+}
+
+interface YouTubeImageGenerationModalProps { // Props of YouTubeImageGenerationModal.
+  open: boolean; // Forwarded to the MUI Dialog.
+  onClose: () => void; // Used as the Dialog's onClose and by the close icon button.
+  onGenerate: (settings: YouTubeImageGenerationSettings) => void; // Receives the modal's current settings.
+  initialPrompt: string; // Seeds the prompt state; the state is re-synced whenever this changes.
+  initialStyle?: "Auto" | "Fiction" | "Realistic"; // Defaults to "Realistic".
+  initialRenderingSpeed?: "Default" | "Turbo" | "Quality"; // Defaults to "Quality".
+  initialAspectRatio?: "1:1" | "16:9" | "9:16" | "4:3" | "3:4"; // Defaults to "16:9".
+  initialModel?: "ideogram-v3-turbo" | "qwen-image"; // Defaults to "ideogram-v3-turbo".
+  isGenerating?: boolean; // Defaults to false.
+  sceneTitle?: string; // When set, shown in a line under the dialog title.
+}
+
+export const YouTubeImageGenerationModal: React.FC<YouTubeImageGenerationModalProps> = ({
+  open,
+  onClose,
+  onGenerate,
+  initialPrompt,
+  initialStyle = "Realistic",
+  initialRenderingSpeed = "Quality",
+  initialAspectRatio = "16:9",
+  initialModel = "ideogram-v3-turbo",
+  isGenerating = false,
+  sceneTitle,
+}) => {
+  const [prompt, setPrompt] = useState(initialPrompt);
+  const [style, setStyle] = useState<"Auto" | "Fiction" | "Realistic">(initialStyle);
+  const [renderingSpeed, setRenderingSpeed] = useState<"Default" | "Turbo" | "Quality">(initialRenderingSpeed);
+  const [aspectRatio, setAspectRatio] = useState<"1:1" | "16:9" | "9:16" | "4:3" | "3:4">(initialAspectRatio);
+  const [model, setModel] = useState<"ideogram-v3-turbo" | "qwen-image">("ideogram-v3-turbo");
+
+  // Update state when initial values change
+  useEffect(() => {
+    setPrompt(initialPrompt);
+    setStyle(initialStyle);
+    setRenderingSpeed(initialRenderingSpeed);
+    setAspectRatio(initialAspectRatio);
+    setModel(initialModel);
+  }, [initialPrompt, initialStyle, initialRenderingSpeed, initialAspectRatio, initialModel]);
+
+  const handleGenerate = () => {
+    onGenerate({
+      prompt,
+      style,
+      renderingSpeed,
+      aspectRatio,
+      model,
+    });
+  };
+
+  const applyPreset = (presetKey: PresetKey) => {
+    const p = PRESETS[presetKey];
+    // Combine the preset prompt with current scene prompt context
+    setPrompt((current) => {
+      // If user already customized, append; otherwise replace with preset
+      if (!current || current.trim() === "" || current.trim() === initialPrompt.trim()) {
+        return `${initialPrompt}\n${p.prompt}`.trim();
+      }
+      return `${current}\n${p.prompt}`.trim();
+    });
+    setStyle(p.style);
+    setRenderingSpeed(p.renderingSpeed);
+    setAspectRatio(p.aspectRatio);
+  };
+
+  return (
+    <Dialog
+      open={open}
+      onClose={onClose}
+      maxWidth="md"
+      fullWidth
+      PaperProps={{
+        sx: {
+          background: alpha("#1a1a2e", 0.95),
+          backdropFilter: "blur(20px)",
+          border: "1px solid rgba(255,255,255,0.1)",
+          borderRadius: 4,
+        },
+      }}
+    >
+      <DialogTitle>
+        <Stack direction="row" justifyContent="space-between" alignItems="center">
+          <Box>
+            <Typography variant="h6" sx={{ color: "white", fontWeight: 600 }}>
+              Generate Scene Image
+            </Typography>
+            {sceneTitle && (
+              <Typography variant="body2" sx={{ color: "rgba(255,255,255,0.6)", mt: 1 }}>
+                Customize image generation for "{sceneTitle}"
+              </Typography>
+            )}
+          </Box>
+          <IconButton
+            onClick={onClose}
+            size="small"
+            sx={{ color: "rgba(255,255,255,0.7)" }}
+          >
+            <CloseIcon />
+          </IconButton>
+        </Stack>
+        <Typography variant="body2" sx={{ color: "rgba(255,255,255,0.6)", mt: 1 }}>
+          Customize image generation parameters for the perfect YouTube scene visual
+        </Typography>
+      </DialogTitle>
+
+      <DialogContent>
+        <Stack spacing={3} sx={{ mt: 1 }}>
+          {/* YouTube-optimized Presets */}
+          <Box>
+            <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 1 }}>
+              <PaletteIcon sx={{ color: "white", fontSize: "1.2rem" }} />
+              <Typography variant="subtitle1" sx={{ color: "white", fontWeight: 600 }}>
+                YouTube-ready presets
+              </Typography>
+              <Tooltip
+                title="Quickly apply a YouTube-optimized look. Each preset adjusts lighting, composition, and style while keeping your avatar consistent."
+                arrow
+              >
+                <IconButton size="small" sx={{ color: "rgba(255,255,255,0.5)" }}>
+                  <HelpOutlineIcon fontSize="small" />
+                </IconButton>
+              </Tooltip>
+            </Stack>
+            <Stack direction={{ xs: "column", sm: "row" }} spacing={1.5}>
+              {(
+                Object.entries(PRESETS) as Array<[PresetKey, (typeof PRESETS)[PresetKey]]>
+              ).map(([key, p]) => (
+                <Paper
+                  key={key}
+                  onClick={() => applyPreset(key)}
+                  sx={{
+                    p: 1.5,
+                    flex: 1,
+                    cursor: "pointer",
+                    backgroundColor: alpha("#ffffff", 0.04),
+                    border: "1px solid rgba(255,255,255,0.1)",
+                    borderRadius: 2,
+                    transition: "all 0.2s ease",
+                    "&:hover": {
+                      borderColor: "rgba(102,126,234,0.7)",
+                      boxShadow: "0 8px 24px rgba(0,0,0,0.25)",
+                      backgroundColor: alpha("#667eea", 0.08),
+                    },
+                  }}
+                >
+                  <Typography variant="subtitle2" sx={{ color: "white", fontWeight: 700 }}>
+                    {p.title}
+                  </Typography>
+                  <Typography variant="body2" sx={{ color: "rgba(255,255,255,0.7)", lineHeight: 1.5, mb: 0.75 }}>
+                    {p.subtitle}
+                  </Typography>
+                  <Stack direction="row" spacing={1} sx={{ color: "rgba(255,255,255,0.6)", fontSize: "0.8rem" }}>
+                    <Typography variant="caption">Style: {p.style}</Typography>
+                    <Typography variant="caption">Speed: {p.renderingSpeed}</Typography>
+                    <Typography variant="caption">AR: {p.aspectRatio}</Typography>
+                  </Stack>
+                </Paper>
+              ))}
+            </Stack>
+          </Box>
+
+          {/* Prompt Section */}
+          <Box>
+            <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 1 }}>
+              <Typography variant="subtitle1" sx={{ color: "white", fontWeight: 600 }}>
+                Visual Prompt
+              </Typography>
+              <Tooltip
+                title="Describe what you want to see in the generated image. Include scene context, visual elements, mood, and style preferences. The AI will use this along with your base avatar to create a consistent character in the YouTube scene."
+                arrow
+              >
+                <IconButton size="small" sx={{ color: "rgba(255,255,255,0.5)" }}>
+                  <HelpOutlineIcon fontSize="small" />
+                </IconButton>
+              </Tooltip>
+            </Stack>
+            <TextField
+              fullWidth
+              multiline
+              rows={4}
+              value={prompt}
+              onChange={(e) => setPrompt(e.target.value)}
+              placeholder="Describe the scene, visual elements, mood, and style..."
+              sx={{
+                "& .MuiOutlinedInput-root": {
+                  backgroundColor: alpha("#ffffff", 0.05),
+                  color: "white",
+                  "& fieldset": {
+                    borderColor: "rgba(255,255,255,0.2)",
+                  },
+                  "&:hover fieldset": {
+                    borderColor: "rgba(255,255,255,0.3)",
+                  },
+                  "&.Mui-focused fieldset": {
+                    borderColor: "#667eea",
+                  },
+                },
+                "& .MuiInputBase-input": {
+                  color: "white",
+                },
+              }}
+            />
+            <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.5)", mt: 0.5, display: "block" }}>
+              This prompt will be combined with scene context to generate your YouTube-ready image. Be specific about visual elements, lighting, and atmosphere.
+            </Typography>
+          </Box>
+
+          <Divider sx={{ borderColor: "rgba(255,255,255,0.1)" }} />
+
+          {/* Style Selection */}
+          <Box>
+            <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 1.5 }}>
+              <Typography variant="subtitle1" sx={{ color: "white", fontWeight: 600 }}>
+                Visual Style
+              </Typography>
+              <Tooltip
+                title="Determines the artistic style of the character generation. Auto lets the AI choose, Fiction creates more stylized/artistic characters, and Realistic produces photorealistic results optimized for video content."
+                arrow
+              >
+                <IconButton size="small" sx={{ color: "rgba(255,255,255,0.5)" }}>
+                  <HelpOutlineIcon fontSize="small" />
+                </IconButton>
+              </Tooltip>
+            </Stack>
+            <FormControl fullWidth>
+              <Select
+                value={style}
+                onChange={(e) => setStyle(e.target.value as "Auto" | "Fiction" | "Realistic")}
+                sx={{
+                  backgroundColor: alpha("#ffffff", 0.05),
+                  color: "white",
+                  "& .MuiOutlinedInput-notchedOutline": {
+                    borderColor: "rgba(255,255,255,0.2)",
+                  },
+                  "&:hover .MuiOutlinedInput-notchedOutline": {
+                    borderColor: "rgba(255,255,255,0.3)",
+                  },
+                  "&.Mui-focused .MuiOutlinedInput-notchedOutline": {
+                    borderColor: "#667eea",
+                  },
+                  "& .MuiSvgIcon-root": {
+                    color: "rgba(255,255,255,0.7)",
+                  },
+                }}
+              >
+                <MenuItem value="Auto">
+                  <Stack>
+                    <Typography sx={{ color: "white" }}>Auto</Typography>
+                    <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.6)" }}>
+                      AI automatically selects the best style
+                    </Typography>
+                  </Stack>
+                </MenuItem>
+                <MenuItem value="Fiction">
+                  <Stack>
+                    <Typography sx={{ color: "white" }}>Fiction</Typography>
+                    <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.6)" }}>
+                      Stylized, artistic character appearance
+                    </Typography>
+                  </Stack>
+                </MenuItem>
+                <MenuItem value="Realistic">
+                  <Stack>
+                    <Typography sx={{ color: "white" }}>Realistic</Typography>
+                    <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.6)" }}>
+                      Photorealistic, professional video appearance
+                    </Typography>
+                  </Stack>
+                </MenuItem>
+              </Select>
+            </FormControl>
+            <Paper
+              sx={{
+                mt: 1.5,
+                p: 1.5,
+                backgroundColor: alpha("#667eea", 0.1),
+                border: "1px solid rgba(102,126,234,0.3)",
+                borderRadius: 2,
+              }}
+            >
+              <Stack direction="row" spacing={1}>
+                <InfoIcon sx={{ color: "#667eea", fontSize: "1.2rem", mt: 0.1 }} />
+                <Box>
+                  <Typography variant="body2" sx={{ color: "rgba(255,255,255,0.9)", fontWeight: 500, mb: 0.5 }}>
+                    Style Impact for YouTube:
+                  </Typography>
+                  <Typography variant="body2" sx={{ color: "rgba(255,255,255,0.7)", lineHeight: 1.6 }}>
+                    <strong>Auto:</strong> Best for most YouTube content, balances professionalism and engagement<br />
+                    <strong>Fiction:</strong> Great for creative content, gaming, or stylized presentations<br />
+                    <strong>Realistic:</strong> Ideal for educational, corporate, or professional YouTube channels
+                  </Typography>
+                </Box>
+              </Stack>
+            </Paper>
+          </Box>
+
+          {/* Rendering Speed */}
+          <Box>
+            <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 1.5 }}>
+              <Typography variant="subtitle1" sx={{ color: "white", fontWeight: 600 }}>
+                Generation Speed
+              </Typography>
+              <Tooltip
+                title="Controls the balance between generation speed, cost, and quality. Turbo is fastest and cheapest but lower quality. Quality is slowest and most expensive but produces the best results for professional YouTube content."
+                arrow
+              >
+                <IconButton size="small" sx={{ color: "rgba(255,255,255,0.5)" }}>
+                  <HelpOutlineIcon fontSize="small" />
+                </IconButton>
+              </Tooltip>
+            </Stack>
+            <FormControl fullWidth>
+              <Select
+                value={renderingSpeed}
+                onChange={(e) => setRenderingSpeed(e.target.value as "Default" | "Turbo" | "Quality")}
+                sx={{
+                  backgroundColor: alpha("#ffffff", 0.05),
+                  color: "white",
+                  "& .MuiOutlinedInput-notchedOutline": {
+                    borderColor: "rgba(255,255,255,0.2)",
+                  },
+                  "&:hover .MuiOutlinedInput-notchedOutline": {
+                    borderColor: "rgba(255,255,255,0.3)",
+                  },
+                  "&.Mui-focused .MuiOutlinedInput-notchedOutline": {
+                    borderColor: "#667eea",
+                  },
+                  "& .MuiSvgIcon-root": {
+                    color: "rgba(255,255,255,0.7)",
+                  },
+                }}
+              >
+                <MenuItem value="Turbo">
+                  <Stack>
+                    <Typography sx={{ color: "white" }}>Turbo ⚡</Typography>
+                    <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.6)" }}>
+                      Fastest (~10-20s) • Cheapest • Good for quick iterations
+                    </Typography>
+                  </Stack>
+                </MenuItem>
+                <MenuItem value="Default">
+                  <Stack>
+                    <Typography sx={{ color: "white" }}>Default ⚖️</Typography>
+                    <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.6)" }}>
+                      Balanced (~30-60s) • Moderate cost • Great for most YouTube content
+                    </Typography>
+                  </Stack>
+                </MenuItem>
+                <MenuItem value="Quality">
+                  <Stack>
+                    <Typography sx={{ color: "white" }}>Quality ✨</Typography>
+                    <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.6)" }}>
+                      Slowest (~60-120s) • Highest quality • Perfect for professional videos
+                    </Typography>
+                  </Stack>
+                </MenuItem>
+              </Select>
+            </FormControl>
+            <Paper
+              sx={{
+                mt: 1.5,
+                p: 1.5,
+                backgroundColor: alpha("#10b981", 0.1),
+                border: "1px solid rgba(16,185,129,0.3)",
+                borderRadius: 2,
+              }}
+            >
+              <Stack direction="row" spacing={1}>
+                <InfoIcon sx={{ color: "#10b981", fontSize: "1.2rem", mt: 0.1 }} />
+                <Box>
+                  <Typography variant="body2" sx={{ color: "rgba(255,255,255,0.9)", fontWeight: 500, mb: 0.5 }}>
+                    Speed vs Quality for YouTube:
+                  </Typography>
+                  <Typography variant="body2" sx={{ color: "rgba(255,255,255,0.7)", lineHeight: 1.6 }}>
+                    <strong>Turbo:</strong> Use for testing and quick iterations (~$0.02/image)<br />
+                    <strong>Default:</strong> Best balance for regular YouTube production (~$0.04/image)<br />
+                    <strong>Quality:</strong> Use for high-stakes, professional content (~$0.08/image)
+                  </Typography>
+                </Box>
+              </Stack>
+            </Paper>
+          </Box>
+
+          {/* AI Model Selection */}
+          <Box>
+            <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 1.5 }}>
+              <Typography variant="subtitle1" sx={{ color: "white", fontWeight: 600 }}>
+                AI Model
+              </Typography>
+              <Tooltip
+                title="Choose the AI model for image generation. Different models offer different quality levels and costs. Ideogram V3 Turbo provides superior text rendering and photorealism."
+                arrow
+              >
+                <IconButton size="small" sx={{ color: "rgba(255,255,255,0.5)" }}>
+                  <HelpOutlineIcon fontSize="small" />
+                </IconButton>
+              </Tooltip>
+            </Stack>
+            <FormControl fullWidth>
+              <Select
+                value={model}
+                onChange={(e) => setModel(e.target.value as "ideogram-v3-turbo" | "qwen-image")}
+                sx={{
+                  backgroundColor: alpha("#ffffff", 0.05),
+                  color: "white",
+                  "& .MuiOutlinedInput-notchedOutline": {
+                    borderColor: "rgba(255,255,255,0.2)",
+                  },
+                  "&:hover .MuiOutlinedInput-notchedOutline": {
+                    borderColor: "rgba(255,255,255,0.3)",
+                  },
+                  "&.Mui-focused .MuiOutlinedInput-notchedOutline": {
+                    borderColor: "#667eea",
+                  },
+                  "& .MuiSvgIcon-root": {
+                    color: "rgba(255,255,255,0.7)",
+                  },
+                }}
+              >
+                <MenuItem value="ideogram-v3-turbo">
+                  <Stack>
+                    <Typography sx={{ color: "white" }}>Ideogram V3 Turbo ✨</Typography>
+                    <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.6)" }}>
+                      Photorealistic • Superior text rendering • $0.10/image
+                    </Typography>
+                  </Stack>
+                </MenuItem>
+                <MenuItem value="qwen-image">
+                  <Stack>
+                    <Typography sx={{ color: "white" }}>Qwen Image ⚡</Typography>
+                    <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.6)" }}>
+                      Fast generation • High quality • $0.05/image
+                    </Typography>
+                  </Stack>
+                </MenuItem>
+              </Select>
+            </FormControl>
+            <Paper
+              sx={{
+                mt: 1.5,
+                p: 1.5,
+                backgroundColor: alpha("#10b981", 0.1),
+                border: "1px solid rgba(16,185,129,0.3)",
+                borderRadius: 2,
+              }}
+            >
+              <Stack direction="row" spacing={1}>
+                <InfoIcon sx={{ color: "#10b981", fontSize: "1.2rem", mt: 0.1 }} />
+                <Box>
+                  <Typography variant="body2" sx={{ color: "rgba(255,255,255,0.9)", fontWeight: 500, mb: 0.5 }}>
+                    Model Recommendations:
+                  </Typography>
+                  <Typography variant="body2" sx={{ color: "rgba(255,255,255,0.7)", lineHeight: 1.6 }}>
+                    <strong>Ideogram V3 Turbo:</strong> Best for professional YouTube content with text, logos, or detailed scenes<br />
+                    <strong>Qwen Image:</strong> Great for fast iterations and general content creation
+                  </Typography>
+                </Box>
+              </Stack>
+            </Paper>
+          </Box>
+
+          {/* Aspect Ratio */}
+          <Box>
+            <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 1.5 }}>
+              <Typography variant="subtitle1" sx={{ color: "white", fontWeight: 600 }}>
+                Aspect Ratio
+              </Typography>
+              <Tooltip
+                title="The width-to-height ratio of the generated image. Choose based on your YouTube format: 16:9 for standard videos, 9:16 for Shorts/mobile, 1:1 for thumbnails, or other formats as needed."
+                arrow
+              >
+                <IconButton size="small" sx={{ color: "rgba(255,255,255,0.5)" }}>
+                  <HelpOutlineIcon fontSize="small" />
+                </IconButton>
+              </Tooltip>
+            </Stack>
+            <FormControl fullWidth>
+              <Select
+                value={aspectRatio}
+                onChange={(e) => setAspectRatio(e.target.value as "1:1" | "16:9" | "9:16" | "4:3" | "3:4")}
+                sx={{
+                  backgroundColor: alpha("#ffffff", 0.05),
+                  color: "white",
+                  "& .MuiOutlinedInput-notchedOutline": {
+                    borderColor: "rgba(255,255,255,0.2)",
+                  },
+                  "&:hover .MuiOutlinedInput-notchedOutline": {
+                    borderColor: "rgba(255,255,255,0.3)",
+                  },
+                  "&.Mui-focused .MuiOutlinedInput-notchedOutline": {
+                    borderColor: "#667eea",
+                  },
+                  "& .MuiSvgIcon-root": {
+                    color: "rgba(255,255,255,0.7)",
+                  },
+                }}
+              >
+                <MenuItem value="16:9">
+                  <Stack>
+                    <Typography sx={{ color: "white" }}>16:9 (Widescreen)</Typography>
+                    <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.6)" }}>
+                      Standard YouTube videos, best for main content
+                    </Typography>
+                  </Stack>
+                </MenuItem>
+                <MenuItem value="9:16">
+                  <Stack>
+                    <Typography sx={{ color: "white" }}>9:16 (Vertical)</Typography>
+                    <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.6)" }}>
+                      YouTube Shorts, TikTok, Instagram Stories
+                    </Typography>
+                  </Stack>
+                </MenuItem>
+                <MenuItem value="1:1">
+                  <Stack>
+                    <Typography sx={{ color: "white" }}>1:1 (Square)</Typography>
+                    <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.6)" }}>
+                      Thumbnails, Instagram posts, profile images
+                    </Typography>
+                  </Stack>
+                </MenuItem>
+                <MenuItem value="4:3">
+                  <Stack>
+                    <Typography sx={{ color: "white" }}>4:3 (Traditional)</Typography>
+                    <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.6)" }}>
+                      Classic format, presentations, older content
+                    </Typography>
+                  </Stack>
+                </MenuItem>
+                <MenuItem value="3:4">
+                  <Stack>
+                    <Typography sx={{ color: "white" }}>3:4 (Portrait)</Typography>
+                    <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.6)" }}>
+                      LinkedIn, some social media formats
+                    </Typography>
+                  </Stack>
+                </MenuItem>
+              </Select>
+            </FormControl>
+            <Paper
+              sx={{
+                mt: 1.5,
+                p: 1.5,
+                backgroundColor: alpha("#f59e0b", 0.1),
+                border: "1px solid rgba(245,158,11,0.3)",
+                borderRadius: 2,
+              }}
+            >
+              <Stack direction="row" spacing={1}>
+                <InfoIcon sx={{ color: "#f59e0b", fontSize: "1.2rem", mt: 0.1 }} />
+                <Box>
+                  <Typography variant="body2" sx={{ color: "rgba(255,255,255,0.9)", fontWeight: 500, mb: 0.5 }}>
+                    YouTube Format Recommendations:
+                  </Typography>
+                  <Typography variant="body2" sx={{ color: "rgba(255,255,255,0.7)", lineHeight: 1.6 }}>
+                    <strong>16:9:</strong> Standard videos (recommended for most content)<br />
+                    <strong>9:16:</strong> YouTube Shorts and mobile-optimized content<br />
+                    <strong>1:1:</strong> Thumbnails and square-format promotional content
+                  </Typography>
+                </Box>
+              </Stack>
+            </Paper>
+          </Box>
+        </Stack>
+      </DialogContent>
+
+      <DialogActions sx={{ p: 3, pt: 2 }}>
+        <IconButton
+          onClick={onClose}
+          disabled={isGenerating}
+          sx={{ color: "rgba(255,255,255,0.7)", mr: 1 }}
+        >
+          <CloseIcon />
+        </IconButton>
+        <Box sx={{ flex: 1 }} />
+        <IconButton
+          onClick={handleGenerate}
+          disabled={isGenerating || !prompt.trim()}
+          sx={{
+            backgroundColor: isGenerating ? "rgba(255,255,255,0.1)" : "#667eea",
+            color: "white",
+            "&:hover": {
+              backgroundColor: isGenerating ? "rgba(255,255,255,0.1)" : "#5a6fd8",
+            },
+            "&:disabled": {
+              backgroundColor: "rgba(255,255,255,0.1)",
+              color: "rgba(255,255,255,0.3)",
+            },
+            px: 3,
+            py: 1,
+            borderRadius: 2,
+          }}
+        >
+          <Typography variant="button" sx={{ fontWeight: 600 }}>
+            {isGenerating ? "Generating..." : "Generate Image"}
+          </Typography>
+        </IconButton>
+      </DialogActions>
+    </Dialog>
+  );
+};
--- a/frontend/src/components/YouTubeCreator/shared/index.ts
+++ b/frontend/src/components/YouTubeCreator/shared/index.ts
@@ -0,0 +1,2 @@
+export { YouTubeImageGenerationModal } from './YouTubeImageGenerationModal';
+export type { YouTubeImageGenerationSettings } from './YouTubeImageGenerationModal';
--- a/frontend/src/components/YouTubeCreator/utils/operationHelpers.ts
+++ b/frontend/src/components/YouTubeCreator/utils/operationHelpers.ts
@@ -195,3 +195,4 @@ export function buildImageGenerationOperation(
  };
 }

+
--- a/frontend/src/components/shared/AudioSettingsModal.tsx
+++ b/frontend/src/components/shared/AudioSettingsModal.tsx
@@ -0,0 +1,648 @@
+import React, { useEffect, useState } from "react";
+import {
+  Dialog,
+  DialogTitle,
+  DialogContent,
+  DialogActions,
+  Stack,
+  Box,
+  Typography,
+  Slider,
+  Select,
+  MenuItem,
+  FormControl,
+  InputLabel,
+  FormControlLabel,
+  Checkbox,
+  Tooltip,
+  IconButton,
+  alpha,
+  TextField,
+} from "@mui/material";
+import { HelpOutline as HelpOutlineIcon, Close as CloseIcon, VolumeUp } from "@mui/icons-material";
+import { Button } from "@mui/material";
+
+export type AudioGenerationSettings = { // settings edited in AudioSettingsModal and passed to onApplySettings
+  voiceId: string; // bound to the voice <Select>, whose values are VOICE_OPTIONS ids
+  speed: number; // shown as "Speaking Speed" with two decimals
+  volume: number; // NOTE(review): valid range not visible here — confirm against the control that edits it
+  pitch: number; // NOTE(review): valid range not visible here — confirm against the control that edits it
+  emotion: string; // presumably one of EMOTION_OPTIONS — verify
+  englishNormalization: boolean;
+  sampleRate?: number; // presumably one of SAMPLE_RATE_OPTIONS (Hz) — verify
+  bitrate: number; // presumably one of BITRATE_OPTIONS — verify
+  channel: "1" | "2"; // presumably mono / stereo — verify
+  format: "mp3" | "wav" | "pcm" | "flac"; // audio format identifier
+  languageBoost?: string; // presumably one of LANGUAGE_BOOST_OPTIONS — verify
+  enableSyncMode: boolean;
+};
+
+interface AudioSettingsModalProps { // props of the AudioSettingsModal dialog
+  open: boolean; // forwarded to the Dialog's `open`
+  onClose: () => void; // used as the Dialog's onClose and by the header close button
+  onApplySettings: (settings: AudioGenerationSettings) => void; // receives the modal's current settings from handleApply
+  initialSettings: AudioGenerationSettings; // seeds the local settings state; state is reset whenever this prop changes
+  isGenerating?: boolean; // defaults to false; NOTE(review): its use is below the visible part of the component
+  sceneTitle?: string; // when set, shown in the header subtitle
+  isRegenerating?: boolean; // defaults to false; switches header/description wording to "Regenerate"
+}
+
+// Voice options from minimax/speech-02-hd with personality descriptions; rendered in the voice <Select> (id = value, name = label, personality = caption)
+const VOICE_OPTIONS = [
+  { id: "Wise_Woman", name: "Wise Woman", personality: "Authoritative, trustworthy female voice - perfect for educational content and expert narration" },
+  { id: "Friendly_Person", name: "Friendly Person", personality: "Warm, approachable voice - great for welcoming introductions and customer-facing content" },
+  { id: "Inspirational_girl", name: "Inspirational Girl", personality: "Motivational, uplifting female voice - ideal for inspirational and motivational content" },
+  { id: "Deep_Voice_Man", name: "Deep Voice Man", personality: "Powerful, commanding male voice - excellent for serious topics and authoritative delivery" },
+  { id: "Calm_Woman", name: "Calm Woman", personality: "Soothing, composed female voice - perfect for meditation, relaxation, or sensitive topics" },
+  { id: "Casual_Guy", name: "Casual Guy", personality: "Relaxed, conversational male voice - great for vlogs, tutorials, and informal content" },
+  { id: "Lively_Girl", name: "Lively Girl", personality: "Energetic, enthusiastic female voice - ideal for exciting announcements and upbeat content" },
+  { id: "Patient_Man", name: "Patient Man", personality: "Gentle, understanding male voice - perfect for explanations and patient guidance" },
+  { id: "Young_Knight", name: "Young Knight", personality: "Brave, confident male voice - great for adventure, gaming, and heroic narratives" },
+  { id: "Determined_Man", name: "Determined Man", personality: "Strong, resolute male voice - excellent for motivational speeches and determined delivery" },
+  { id: "Lovely_Girl", name: "Lovely Girl", personality: "Sweet, charming female voice - ideal for storytelling and gentle narratives" },
+  { id: "Decent_Boy", name: "Decent Boy", personality: "Honest, sincere male voice - perfect for testimonials and personal stories" },
+  { id: "Imposing_Manner", name: "Imposing Manner", personality: "Formal, dignified male voice - great for corporate content and official announcements" },
+  { id: "Elegant_Man", name: "Elegant Man", personality: "Refined, sophisticated male voice - ideal for luxury, premium content" },
+  { id: "Abbess", name: "Abbess", personality: "Spiritual, serene female voice - perfect for meditation, philosophy, or contemplative content" },
+  { id: "Sweet_Girl_2", name: "Sweet Girl 2", personality: "Gentle, melodic female voice - excellent for children's content and soft storytelling" },
+  { id: "Exuberant_Girl", name: "Exuberant Girl", personality: "Joyful, expressive female voice - ideal for celebrations and happy announcements" },
+];
+
+const EMOTION_OPTIONS = ["happy", "sad", "angry", "fearful", "disgusted", "surprised", "neutral"]; // presumably the choices for settings.emotion — consumer is outside this view
+
+const SAMPLE_RATE_OPTIONS = [8000, 16000, 22050, 24000, 32000, 44100]; // presumably Hz values for settings.sampleRate — verify
+const BITRATE_OPTIONS = [32000, 64000, 128000, 256000]; // presumably bits/s values for settings.bitrate — verify
+const LANGUAGE_BOOST_OPTIONS = [ // presumably the choices for settings.languageBoost — verify
+  "auto",
+  "English",
+  "Chinese",
+  "Chinese,Yue",
+  "Arabic",
+  "Russian",
+  "Spanish",
+  "French",
+  "Portuguese",
+  "German",
+  "Turkish",
+  "Dutch",
+  "Ukrainian",
+  "Vietnamese",
+  "Indonesian",
+  "Japanese",
+  "Italian",
+  "Korean",
+  "Thai",
+  "Polish",
+  "Romanian",
+  "Greek",
+  "Czech",
+  "Finnish",
+  "Hindi",
+];
+
+export const AudioSettingsModal: React.FC<AudioSettingsModalProps> = ({
+  open,
+  onClose,
+  onApplySettings,
+  initialSettings,
+  isGenerating = false,
+  sceneTitle,
+  isRegenerating = false,
+}) => {
+  const [settings, setSettings] = useState<AudioGenerationSettings>(initialSettings);
+
+  useEffect(() => {
+    setSettings(initialSettings);
+  }, [initialSettings]);
+
+  const handleApply = () => {
+    onApplySettings(settings);
+  };
+
+  return (
+    <Dialog
+      open={open}
+      onClose={onClose}
+      maxWidth="md"
+      fullWidth
+      PaperProps={{
+        sx: {
+          background: "linear-gradient(135deg, #667eea 0%, #764ba2 100%)",
+          color: "white",
+        },
+      }}
+    >
+      <DialogTitle>
+        <Stack direction="row" justifyContent="space-between" alignItems="center">
+          <Box>
+            <Typography variant="h6" sx={{ fontWeight: 600, mb: 0.5 }}>
+              {isRegenerating ? 'Regenerate Audio' : 'Generate Audio'} - Voice Settings
+            </Typography>
+            {sceneTitle && (
+              <Typography variant="body2" sx={{ opacity: 0.8 }}>
+                Configure voice settings for "{sceneTitle}"
+              </Typography>
+            )}
+          </Box>
+          <IconButton onClick={onClose} size="small" sx={{ color: "rgba(255,255,255,0.7)" }}>
+            <CloseIcon />
+          </IconButton>
+        </Stack>
+        <Typography variant="body2" sx={{ opacity: 0.7, mt: 1 }}>
+          {isRegenerating
+            ? 'Customize voice settings to regenerate your audio narration with different characteristics.'
+            : 'Choose voice settings to generate high-quality audio narration for your scene.'
+          }
+        </Typography>
+      </DialogTitle>
+
+      <DialogContent>
+        <Stack spacing={3} sx={{ mt: 1 }}>
+          {/* Voice Selection */}
+          <Box>
+            <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 1 }}>
+              <Typography variant="subtitle1" sx={{ fontWeight: 600 }}>
+                Voice Selection
+              </Typography>
+              <Tooltip title={
+                <Box>
+                  <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
+                    Voice Selection Guide
+                  </Typography>
+                  <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                    Choose a voice that matches your content's personality and target audience.
+                  </Typography>
+                  <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                    • <strong>YouTube/Vlogging</strong>: Casual Guy (default), Friendly Person - conversational and engaging
+                  </Typography>
+                  <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                    • <strong>Educational/Tutorials</strong>: Wise Woman, Deep Voice Man - authoritative and trustworthy
+                  </Typography>
+                  <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                    • <strong>Motivational</strong>: Inspirational Girl, Determined Man - energetic and inspiring
+                  </Typography>
+                  <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                    • <strong>Relaxing/Storytelling</strong>: Calm Woman, Lovely Girl - soothing and gentle
+                  </Typography>
+                  <Typography variant="caption" sx={{ display: 'block' }}>
+                    <strong>Default:</strong> Casual Guy - optimized for engaging YouTube narration.
+                  </Typography>
+                </Box>
+              } arrow placement="right">
+                <IconButton size="small" sx={{ color: "rgba(255,255,255,0.5)" }}>
+                  <HelpOutlineIcon fontSize="small" />
+                </IconButton>
+              </Tooltip>
+            </Stack>
+            <FormControl fullWidth>
+              <Select
+                value={settings.voiceId}
+                onChange={(e) => setSettings({ ...settings, voiceId: e.target.value })}
+                sx={{
+                  backgroundColor: alpha("#ffffff", 0.1),
+                  color: "white",
+                  "& .MuiOutlinedInput-notchedOutline": { borderColor: "rgba(255,255,255,0.3)" },
+                  "&:hover .MuiOutlinedInput-notchedOutline": { borderColor: "rgba(255,255,255,0.4)" },
+                  "&.Mui-focused .MuiOutlinedInput-notchedOutline": { borderColor: "#ffffff" },
+                  "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                }}
+              >
+                {VOICE_OPTIONS.map((voice) => (
+                  <MenuItem key={voice.id} value={voice.id}>
+                    <Box>
+                      <Typography variant="body2" sx={{ fontWeight: 600, color: "white" }}>
+                        {voice.name}
+                      </Typography>
+                      <Typography variant="caption" sx={{ color: "rgba(255,255,255,0.7)", display: "block", fontSize: "0.7rem" }}>
+                        {voice.personality}
+                      </Typography>
+                    </Box>
+                  </MenuItem>
+                ))}
+              </Select>
+            </FormControl>
+          </Box>
+
+          {/* Speed / Volume / Pitch */}
+          <Stack direction={{ xs: "column", sm: "row" }} spacing={2}>
+            <Box sx={{ flex: 1 }}>
+              <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 0.5 }}>
+                <Typography variant="subtitle2" sx={{ fontWeight: 600 }}>
+                  Speaking Speed ({settings.speed.toFixed(2)})
+                </Typography>
+                <Tooltip title={
+                  <Box>
+                    <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
+                      Natural Speaking Pace
+                    </Typography>
+                    <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                      • <strong>0.8-1.0</strong>: Slow, deliberate (educational, complex topics)
+                    </Typography>
+                    <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                      • <strong>1.1-1.2</strong>: Natural, engaging (recommended for YouTube)
+                    </Typography>
+                    <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                      • <strong>1.3-1.5</strong>: Fast, energetic (exciting, promotional content)
+                    </Typography>
+                    <Typography variant="caption" sx={{ display: 'block', mt: 0.5 }}>
+                      <strong>Default:</strong> 1.15 - Optimized for engaging YouTube narration.
+                    </Typography>
+                  </Box>
+                } arrow placement="right">
+                  <HelpOutlineIcon fontSize="small" sx={{ color: "rgba(255,255,255,0.5)" }} />
+                </Tooltip>
+              </Stack>
+              <Slider
+                value={settings.speed}
+                min={0.5}
+                max={2.0}
+                step={0.05}
+                onChange={(_, v) => setSettings({ ...settings, speed: v as number })}
+                sx={{ color: "#4ade80" }}
+              />
+              <Typography variant="caption" sx={{ opacity: 0.7 }}>
+                0.5 = Slower (narrative) • 1.0 = Normal • 2.0 = Faster (energetic)
+              </Typography>
+            </Box>
+
+            <Box sx={{ flex: 1 }}>
+              <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 0.5 }}>
+                <Typography variant="subtitle2" sx={{ fontWeight: 600 }}>
+                  Volume Level ({settings.volume.toFixed(1)})
+                </Typography>
+                <Tooltip title={
+                  <Box>
+                    <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
+                      Audio Loudness
+                    </Typography>
+                    <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                      • <strong>0.1-0.5</strong>: Very soft, intimate whisper
+                    </Typography>
+                    <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                      • <strong>0.8-1.2</strong>: Normal speaking volume
+                    </Typography>
+                    <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                      • <strong>1.5-10.0</strong>: Loud, commanding presence
+                    </Typography>
+                    <Typography variant="caption" sx={{ display: 'block', mt: 0.5 }}>
+                      <strong>Note:</strong> Very high volumes may cause distortion.
+                    </Typography>
+                  </Box>
+                } arrow placement="right">
+                  <HelpOutlineIcon fontSize="small" sx={{ color: "rgba(255,255,255,0.5)" }} />
+                </Tooltip>
+              </Stack>
+              <Slider
+                value={settings.volume}
+                min={0.1}
+                max={10.0}
+                step={0.1}
+                onChange={(_, v) => setSettings({ ...settings, volume: v as number })}
+                sx={{ color: "#fbbf24" }}
+              />
+              <Typography variant="caption" sx={{ opacity: 0.7 }}>
+                0.1 = Very soft • 1.0 = Normal • 10.0 = Very loud
+              </Typography>
+            </Box>
+
+            <Box sx={{ flex: 1 }}>
+              <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 0.5 }}>
+                <Typography variant="subtitle2" sx={{ fontWeight: 600 }}>
+                  Voice Pitch ({settings.pitch})
+                </Typography>
+                <Tooltip title={
+                  <Box>
+                    <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
+                      Voice Tone & Character
+                    </Typography>
+                    <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                      • <strong>-12 to -6</strong>: Deep, authoritative (male voices, serious content)
+                    </Typography>
+                    <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                      • <strong>-2 to +2</strong>: Natural, conversational range
+                    </Typography>
+                    <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                      • <strong>+3 to +12</strong>: Bright, energetic (female voices, upbeat content)
+                    </Typography>
+                    <Typography variant="caption" sx={{ display: 'block', mt: 0.5 }}>
+                      <strong>Tip:</strong> Small adjustments (±2) sound most natural.
+                    </Typography>
+                  </Box>
+                } arrow placement="right">
+                  <HelpOutlineIcon fontSize="small" sx={{ color: "rgba(255,255,255,0.5)" }} />
+                </Tooltip>
+              </Stack>
+              <Slider
+                value={settings.pitch}
+                min={-12}
+                max={12}
+                step={0.5}
+                onChange={(_, v) => setSettings({ ...settings, pitch: v as number })}
+                sx={{ color: "#f87171" }}
+              />
+              <Typography variant="caption" sx={{ opacity: 0.7 }}>
+                -12 = Very deep • 0 = Normal • +12 = Very high
+              </Typography>
+            </Box>
+          </Stack>
+
+          {/* Emotion */}
+          <Box>
+            <Stack direction="row" spacing={1} alignItems="center" sx={{ mb: 1 }}>
+              <Typography variant="subtitle1" sx={{ fontWeight: 600 }}>
+                Emotional Delivery
+              </Typography>
+              <Tooltip title={
+                <Box>
+                  <Typography variant="body2" sx={{ fontWeight: 600, mb: 0.5 }}>
+                    Voice Emotional Expression
+                  </Typography>
+                  <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                    Choose the emotional tone that matches your content:
+                  </Typography>
+                  <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                    • <strong>Happy</strong>: Warm, enthusiastic delivery
+                  </Typography>
+                  <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                    • <strong>Neutral</strong>: Professional, straightforward tone
+                  </Typography>
+                  <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                    • <strong>Sad</strong>: Somber, reflective delivery
+                  </Typography>
+                  <Typography variant="caption" sx={{ display: 'block', mb: 0.5 }}>
+                    • <strong>Angry</strong>: Forceful, urgent tone (use sparingly)
+                  </Typography>
+                  <Typography variant="caption" sx={{ display: 'block', mt: 0.5 }}>
+                    <strong>Recommendation:</strong> Happy/Neutral for most educational content.
+                  </Typography>
+                </Box>
+              } arrow placement="right">
+                <IconButton size="small" sx={{ color: "rgba(255,255,255,0.5)" }}>
+                  <HelpOutlineIcon fontSize="small" />
+                </IconButton>
+              </Tooltip>
+            </Stack>
+            <FormControl fullWidth>
+              <Select
+                value={settings.emotion}
+                onChange={(e) => setSettings({ ...settings, emotion: e.target.value })}
+                sx={{
+                  backgroundColor: alpha("#ffffff", 0.1),
+                  color: "white",
+                  "& .MuiOutlinedInput-notchedOutline": { borderColor: "rgba(255,255,255,0.3)" },
+                  "&:hover .MuiOutlinedInput-notchedOutline": { borderColor: "rgba(255,255,255,0.4)" },
+                  "&.Mui-focused .MuiOutlinedInput-notchedOutline": { borderColor: "#ffffff" },
+                  "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                }}
+              >
+                {EMOTION_OPTIONS.map((emotion) => (
+                  <MenuItem key={emotion} value={emotion}>
+                    {emotion.charAt(0).toUpperCase() + emotion.slice(1)}
+                  </MenuItem>
+                ))}
+              </Select>
+            </FormControl>
+            <Typography variant="caption" sx={{ opacity: 0.7, mt: 0.5, display: "block" }}>
+              Select the emotional tone. "Happy" provides natural, engaging delivery for most YouTube content.
+            </Typography>
+          </Box>
+
+          {/* Language & Normalization */}
+          <Stack direction={{ xs: "column", sm: "row" }} spacing={2}>
+            <Box sx={{ flex: 1 }}>
+              <FormControlLabel
+                control={
+                  <Checkbox
+                    checked={settings.englishNormalization}
+                    onChange={(e) => setSettings({ ...settings, englishNormalization: e.target.checked })}
+                    sx={{ color: "rgba(255,255,255,0.7)" }}
+                  />
+                }
+                label={
+                  <Typography variant="body2" sx={{ color: "white" }}>
+                    English text normalization
+                  </Typography>
+                }
+              />
+              <Typography variant="caption" sx={{ opacity: 0.7 }}>
+                Improves pronunciation of numbers (42 → "forty-two"), dates, currencies, and technical terms. Recommended for most English content.
+              </Typography>
+            </Box>
+
+            <Box sx={{ flex: 1 }}>
+              <TextField
+                select
+                fullWidth
+                label="Language boost"
+                value={settings.languageBoost || "auto"}
+                onChange={(e) => setSettings({ ...settings, languageBoost: e.target.value })}
+                InputLabelProps={{ sx: { color: "rgba(255,255,255,0.7)" } }}
+                sx={{
+                  "& .MuiOutlinedInput-root": {
+                    backgroundColor: alpha("#ffffff", 0.1),
+                    color: "white",
+                    "& fieldset": { borderColor: "rgba(255,255,255,0.3)" },
+                    "&:hover fieldset": { borderColor: "rgba(255,255,255,0.4)" },
+                    "&.Mui-focused fieldset": { borderColor: "#ffffff" },
+                    "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                  },
+                }}
+              >
+                {LANGUAGE_BOOST_OPTIONS.map((option) => (
+                  <MenuItem key={option} value={option}>
+                    {option}
+                  </MenuItem>
+                ))}
+              </TextField>
+              <Typography variant="caption" sx={{ opacity: 0.7, mt: 0.5, display: "block" }}>
+                Improves pronunciation accuracy for content in specific languages or regional dialects. Use "auto" for automatic detection.
+              </Typography>
+            </Box>
+          </Stack>
+
+          {/* Quality Settings */}
+          <Stack direction={{ xs: "column", sm: "row" }} spacing={2}>
+            <Box sx={{ flex: 1 }}>
+              <TextField
+                select
+                fullWidth
+                label="Sample rate"
+                value={settings.sampleRate || 24000}
+                onChange={(e) => setSettings({ ...settings, sampleRate: Number(e.target.value) })}
+                InputLabelProps={{ sx: { color: "rgba(255,255,255,0.7)" } }}
+                sx={{
+                  "& .MuiOutlinedInput-root": {
+                    backgroundColor: alpha("#ffffff", 0.1),
+                    color: "white",
+                    "& fieldset": { borderColor: "rgba(255,255,255,0.3)" },
+                    "&:hover fieldset": { borderColor: "rgba(255,255,255,0.4)" },
+                    "&.Mui-focused fieldset": { borderColor: "#ffffff" },
+                    "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                  },
+                }}
+              >
+                {SAMPLE_RATE_OPTIONS.map((rate) => (
+                  <MenuItem key={rate} value={rate}>
+                    {rate} Hz
+                  </MenuItem>
+                ))}
+              </TextField>
+              <Typography variant="caption" sx={{ opacity: 0.7, mt: 0.5, display: "block" }}>
+                Sample rate affects audio clarity. 24kHz is optimal for voice content - higher values increase file size without noticeable improvement.
+              </Typography>
+            </Box>
+
+            <Box sx={{ flex: 1 }}>
+              <TextField
+                select
+                fullWidth
+                label="Bitrate"
+                value={settings.bitrate}
+                onChange={(e) => setSettings({ ...settings, bitrate: Number(e.target.value) })}
+                InputLabelProps={{ sx: { color: "rgba(255,255,255,0.7)" } }}
+                sx={{
+                  "& .MuiOutlinedInput-root": {
+                    backgroundColor: alpha("#ffffff", 0.1),
+                    color: "white",
+                    "& fieldset": { borderColor: "rgba(255,255,255,0.3)" },
+                    "&:hover fieldset": { borderColor: "rgba(255,255,255,0.4)" },
+                    "&.Mui-focused fieldset": { borderColor: "#ffffff" },
+                    "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                  },
+                }}
+              >
+                {BITRATE_OPTIONS.map((bitrate) => (
+                  <MenuItem key={bitrate} value={bitrate}>
+                    {bitrate / 1000} kbps
+                  </MenuItem>
+                ))}
+              </TextField>
+              <Typography variant="caption" sx={{ opacity: 0.7, mt: 0.5, display: "block" }}>
+                Audio quality vs file size trade-off. 128kbps provides excellent voice quality with reasonable file sizes.
+              </Typography>
+            </Box>
+          </Stack>
+
+          {/* Format & Channel */}
+          <Stack direction={{ xs: "column", sm: "row" }} spacing={2}>
+            <Box sx={{ flex: 1 }}>
+              <TextField
+                select
+                fullWidth
+                label="Channel"
+                value={settings.channel}
+                onChange={(e) => setSettings({ ...settings, channel: e.target.value as "1" | "2" })}
+                InputLabelProps={{ sx: { color: "rgba(255,255,255,0.7)" } }}
+                sx={{
+                  "& .MuiOutlinedInput-root": {
+                    backgroundColor: alpha("#ffffff", 0.1),
+                    color: "white",
+                    "& fieldset": { borderColor: "rgba(255,255,255,0.3)" },
+                    "&:hover fieldset": { borderColor: "rgba(255,255,255,0.4)" },
+                    "&.Mui-focused fieldset": { borderColor: "#ffffff" },
+                    "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                  },
+                }}
+              >
+                <MenuItem value="1">Mono (smaller files, standard for voice)</MenuItem>
+                <MenuItem value="2">Stereo (wider sound, larger files)</MenuItem>
+              </TextField>
+            </Box>
+
+            <Box sx={{ flex: 1 }}>
+              <TextField
+                select
+                fullWidth
+                label="Format"
+                value={settings.format}
+                onChange={(e) => setSettings({ ...settings, format: e.target.value as "mp3" | "wav" | "pcm" | "flac" })}
+                InputLabelProps={{ sx: { color: "rgba(255,255,255,0.7)" } }}
+                sx={{
+                  "& .MuiOutlinedInput-root": {
+                    backgroundColor: alpha("#ffffff", 0.1),
+                    color: "white",
+                    "& fieldset": { borderColor: "rgba(255,255,255,0.3)" },
+                    "&:hover fieldset": { borderColor: "rgba(255,255,255,0.4)" },
+                    "&.Mui-focused fieldset": { borderColor: "#ffffff" },
+                    "& .MuiSvgIcon-root": { color: "rgba(255,255,255,0.7)" },
+                  },
+                }}
+              >
+                <MenuItem value="mp3">MP3 (compressed, widely supported)</MenuItem>
+                <MenuItem value="wav">WAV (uncompressed, highest quality)</MenuItem>
+                <MenuItem value="pcm">PCM (raw data, specialized use)</MenuItem>
+                <MenuItem value="flac">FLAC (lossless, large files)</MenuItem>
+              </TextField>
+            </Box>
+          </Stack>
+
+          {/* Sync Mode */}
+          <Box>
+            <FormControlLabel
+              control={
+                <Checkbox
+                  checked={settings.enableSyncMode}
+                  onChange={(e) => setSettings({ ...settings, enableSyncMode: e.target.checked })}
+                  sx={{ color: "rgba(255,255,255,0.7)" }}
+                />
+              }
+              label={
+                <Typography variant="body2" sx={{ color: "white" }}>
+                  Enable sync mode (recommended)
+                </Typography>
+              }
+            />
+            <Typography variant="caption" sx={{ opacity: 0.7 }}>
+              When enabled, waits for generation to complete before proceeding. Recommended for reliable audio delivery.
+            </Typography>
+          </Box>
+
+          {/* Pro Tips */}
+          <Box sx={{ mt: 2, p: 2, bgcolor: alpha("#ffffff", 0.05), borderRadius: 1, border: "1px solid rgba(255,255,255,0.1)" }}>
+            <Typography variant="subtitle2" sx={{ fontWeight: 600, mb: 1, color: "white" }}>
+              💡 Human-Like Audio Tips
+            </Typography>
+            <Typography variant="caption" sx={{ opacity: 0.8, display: "block", mb: 0.5 }}>
+              • <strong>Voice Choice</strong>: "Casual_Guy" provides natural, conversational delivery perfect for YouTube
+            </Typography>
+            <Typography variant="caption" sx={{ opacity: 0.8, display: "block", mb: 0.5 }}>
+              • <strong>Speed</strong>: 1.15 provides engaging pace - not too slow, not too fast, just right for viewers
+            </Typography>
+            <Typography variant="caption" sx={{ opacity: 0.8, display: "block", mb: 0.5 }}>
+              • <strong>Emotion</strong>: "Happy" creates natural, positive delivery that keeps viewers engaged
+            </Typography>
+            <Typography variant="caption" sx={{ opacity: 0.8, display: "block", mb: 0.5 }}>
+              • <strong>Quality</strong>: 128kbps MP3 provides professional quality with optimal file sizes
+            </Typography>
+            <Typography variant="caption" sx={{ opacity: 0.8, display: "block" }}>
+              • <strong>Enhancement</strong>: English normalization improves pronunciation of numbers, dates, and technical terms
+            </Typography>
+          </Box>
+        </Stack>
+      </DialogContent>
+
+      <DialogActions sx={{ p: 3, pt: 2 }}>
+        <Button
+          onClick={onClose}
+          disabled={isGenerating}
+          sx={{ color: "rgba(255,255,255,0.7)" }}
+        >
+          Cancel
+        </Button>
+        <Button
+          onClick={handleApply}
+          variant="contained"
+          disabled={isGenerating}
+          startIcon={isGenerating ? undefined : <VolumeUp />}
+          sx={{
+            backgroundColor: "#4ade80",
+            "&:hover": { backgroundColor: "#22c55e" },
+            "&:disabled": { backgroundColor: "rgba(255,255,255,0.2)" },
+          }}
+        >
+          {isGenerating ? "Generating..." : "Apply Settings & Generate"}
+        </Button>
+      </DialogActions>
+    </Dialog>
+  );
+};
--- a/frontend/src/components/shared/index.ts
+++ b/frontend/src/components/shared/index.ts
@@ -18,4 +18,8 @@ export * from './utils';

 // Asset Library modal (images only)
 export { AssetLibraryImageModal } from './AssetLibraryImageModal';
-export type { AssetLibraryImageModalProps } from './AssetLibraryImageModal';
+export type { AssetLibraryImageModalProps } from './AssetLibraryImageModal';
+
+// Audio Settings modal (shared across tools)
+export { AudioSettingsModal } from './AudioSettingsModal';
+export type { AudioGenerationSettings } from './AudioSettingsModal';