SEO Dashboard Fixes and content planning refactoring

2025-10-29 17:10:48 +05:30
parent 5866f49325
commit 4431cd9848
92 changed files with 7046 additions and 1940 deletions
--- a/backend/services/blog_writer/circuit_breaker.py
+++ b/backend/services/blog_writer/circuit_breaker.py
@@ -0,0 +1,209 @@
+"""
+Circuit Breaker Pattern for Blog Writer API Calls
+
+Implements circuit breaker pattern to prevent cascading failures when external APIs
+are experiencing issues. Tracks failure rates and automatically disables calls when
+threshold is exceeded, with auto-recovery after cooldown period.
+"""
+
+import time
+import asyncio
+from typing import Callable, Any, Optional, Dict
+from enum import Enum
+from dataclasses import dataclass
+from loguru import logger
+
+from .exceptions import CircuitBreakerOpenException
+
+
+class CircuitState(Enum):
+    """States a circuit breaker can be in."""
+
+    # Normal operation
+    CLOSED = "closed"
+    # Circuit is open, calls are blocked
+    OPEN = "open"
+    # Testing if service is back
+    HALF_OPEN = "half_open"
+
+
+@dataclass
+class CircuitBreakerConfig:
+    """Thresholds and timings that tune a circuit breaker."""
+
+    # Number of failures before opening
+    failure_threshold: int = 5
+    # Seconds to wait before trying again
+    recovery_timeout: int = 60
+    # Successes needed to close from half-open
+    success_threshold: int = 3
+    # Timeout for individual calls
+    timeout: int = 30
+    # Max failures per minute before opening
+    max_failures_per_minute: int = 10
+
+
+class CircuitBreaker:
+    """Circuit breaker implementation for API calls.
+
+    States:
+        CLOSED: calls run normally and failures are counted.
+        OPEN: calls are rejected with ``CircuitBreakerOpenException`` until
+            ``config.recovery_timeout`` seconds have passed since
+            ``last_failure_time``.
+        HALF_OPEN: calls run again as probes; ``config.success_threshold``
+            successes close the circuit and any failure re-opens it.
+    """
+
+    def __init__(self, name: str, config: Optional[CircuitBreakerConfig] = None):
+        """Create a closed breaker called ``name`` using ``config`` (or defaults)."""
+        self.name = name
+        self.config = config or CircuitBreakerConfig()
+        self.state = CircuitState.CLOSED
+        self.failure_count = 0
+        self.success_count = 0
+        self.last_failure_time = 0
+        self.last_success_time = 0
+        self.failure_times = []  # Track failure times for rate limiting
+        self._lock = asyncio.Lock()
+
+    async def call(self, func: Callable, *args, **kwargs) -> Any:
+        """
+        Execute function with circuit breaker protection.
+
+        Args:
+            func: Callable returning an awaitable (it is awaited via
+                ``asyncio.wait_for``)
+            *args: Function arguments
+            **kwargs: Function keyword arguments
+
+        Returns:
+            Function result
+
+        Raises:
+            CircuitBreakerOpenException: If circuit is open
+            asyncio.TimeoutError: If the call exceeds ``config.timeout`` seconds
+            Exception: Anything raised by ``func`` is recorded as a failure and
+                re-raised unchanged
+        """
+        # The lock only guards the state check; it is released before the
+        # wrapped call runs so calls are not serialized.
+        async with self._lock:
+            # Check if circuit should be opened due to rate limiting
+            await self._check_rate_limit()
+
+            # Check circuit state
+            if self.state == CircuitState.OPEN:
+                if self._should_attempt_reset():
+                    self.state = CircuitState.HALF_OPEN
+                    self.success_count = 0
+                    logger.info(f"Circuit breaker {self.name} transitioning to HALF_OPEN")
+                else:
+                    retry_after = int(self.config.recovery_timeout - (time.time() - self.last_failure_time))
+                    raise CircuitBreakerOpenException(
+                        f"Circuit breaker {self.name} is OPEN",
+                        retry_after=max(0, retry_after),
+                        context={"circuit_name": self.name, "state": self.state.value}
+                    )
+
+        try:
+            # Execute the function with timeout
+            result = await asyncio.wait_for(
+                func(*args, **kwargs),
+                timeout=self.config.timeout
+            )
+
+            # Record success
+            await self._record_success()
+            return result
+
+        except asyncio.TimeoutError:
+            await self._record_failure("timeout")
+            raise
+        except Exception as e:
+            await self._record_failure(str(e))
+            raise
+
+    async def _check_rate_limit(self):
+        """Prune the failure window and trip the breaker if it is over the limit.
+
+        Expects the caller to already hold ``self._lock`` (as ``call`` does).
+        """
+        current_time = time.time()
+
+        # Remove failures older than 1 minute
+        self.failure_times = [
+            failure_time for failure_time in self.failure_times
+            if current_time - failure_time < 60
+        ]
+
+        # Only a CLOSED circuit can be tripped here. Re-tripping an OPEN one
+        # would push last_failure_time forward on every rejected call and keep
+        # extending the cooldown; re-tripping a HALF_OPEN one would abort the
+        # recovery probe before it has run.
+        if self.state != CircuitState.CLOSED:
+            return
+
+        # Check if we've exceeded the rate limit
+        if len(self.failure_times) >= self.config.max_failures_per_minute:
+            self.state = CircuitState.OPEN
+            self.last_failure_time = current_time
+            logger.warning(f"Circuit breaker {self.name} opened due to rate limit: {len(self.failure_times)} failures in last minute")
+
+    def _should_attempt_reset(self) -> bool:
+        """Check if enough time has passed to attempt reset."""
+        return time.time() - self.last_failure_time >= self.config.recovery_timeout
+
+    async def _record_success(self):
+        """Record a successful call and close the circuit once enough probes pass."""
+        async with self._lock:
+            self.last_success_time = time.time()
+
+            if self.state == CircuitState.HALF_OPEN:
+                self.success_count += 1
+                if self.success_count >= self.config.success_threshold:
+                    self.state = CircuitState.CLOSED
+                    self.failure_count = 0
+                    # Start the recovered circuit with an empty failure window,
+                    # otherwise failures from before the outage could trip the
+                    # rate limit again on the very next call.
+                    self.failure_times = []
+                    logger.info(f"Circuit breaker {self.name} closed after {self.success_count} successes")
+            elif self.state == CircuitState.CLOSED:
+                # Reset failure count on success
+                self.failure_count = 0
+
+    async def _record_failure(self, error: str):
+        """Record a failed call and open the circuit when warranted.
+
+        Args:
+            error: Short description of the failure, used for logging only.
+        """
+        async with self._lock:
+            current_time = time.time()
+            probe_failed = self.state == CircuitState.HALF_OPEN
+            self.failure_count += 1
+            self.last_failure_time = current_time
+            self.failure_times.append(current_time)
+
+            logger.warning(f"Circuit breaker {self.name} recorded failure #{self.failure_count}: {error}")
+
+            if probe_failed:
+                # A failed probe means the service is still unhealthy: re-open
+                # immediately instead of relying on failure_count still being
+                # above the threshold from before the circuit opened.
+                self.state = CircuitState.OPEN
+                logger.error(f"Circuit breaker {self.name} re-opened after failed HALF_OPEN probe")
+            elif self.failure_count >= self.config.failure_threshold:
+                # Open circuit if threshold exceeded
+                self.state = CircuitState.OPEN
+                logger.error(f"Circuit breaker {self.name} opened after {self.failure_count} failures")
+
+    def get_state(self) -> Dict[str, Any]:
+        """Return a snapshot of the breaker's counters, timestamps and state."""
+        now = time.time()  # read the clock once for the whole window count
+        return {
+            "name": self.name,
+            "state": self.state.value,
+            "failure_count": self.failure_count,
+            "success_count": self.success_count,
+            "last_failure_time": self.last_failure_time,
+            "last_success_time": self.last_success_time,
+            "failures_in_last_minute": sum(
+                1 for t in self.failure_times if now - t < 60
+            )
+        }
+
+
+class CircuitBreakerManager:
+    """Manages multiple circuit breakers, keyed by name."""
+
+    def __init__(self):
+        """Start with an empty registry of breakers."""
+        self._breakers: Dict[str, CircuitBreaker] = {}
+
+    def get_breaker(self, name: str, config: Optional[CircuitBreakerConfig] = None) -> CircuitBreaker:
+        """Get or create a circuit breaker.
+
+        Args:
+            name: Registry key of the breaker.
+            config: Used only when the breaker is created; it is ignored if a
+                breaker with this name already exists.
+
+        Returns:
+            The breaker registered under ``name``.
+        """
+        if name not in self._breakers:
+            self._breakers[name] = CircuitBreaker(name, config)
+        return self._breakers[name]
+
+    def get_all_states(self) -> Dict[str, Dict[str, Any]]:
+        """Get states of all circuit breakers, keyed by breaker name."""
+        return {name: breaker.get_state() for name, breaker in self._breakers.items()}
+
+    def reset_breaker(self, name: str):
+        """Reset a circuit breaker to closed state; unknown names are ignored."""
+        breaker = self._breakers.get(name)
+        if breaker is None:
+            return
+
+        breaker.state = CircuitState.CLOSED
+        breaker.failure_count = 0
+        breaker.success_count = 0
+        # Also drop the per-minute failure history. If it were kept, the
+        # breaker's rate-limit check could re-open the circuit on the first
+        # call after the manual reset.
+        breaker.failure_times = []
+        logger.info(f"Circuit breaker {name} manually reset")
+
+
+# Global circuit breaker manager: the module-level registry that the
+# circuit_breaker() decorator looks breakers up in.
+circuit_breaker_manager = CircuitBreakerManager()
+
+
+def circuit_breaker(name: str, config: Optional[CircuitBreakerConfig] = None):
+    """
+    Decorator to add circuit breaker protection to async functions.
+
+    The breaker is looked up (or created) in the global manager on every call,
+    so all functions decorated with the same ``name`` share one breaker.
+
+    Args:
+        name: Circuit breaker name
+        config: Circuit breaker configuration
+
+    Returns:
+        A decorator that wraps an async function in ``CircuitBreaker.call``.
+    """
+    # Imported locally so this function does not depend on a module-level import.
+    from functools import wraps
+
+    def decorator(func: Callable) -> Callable:
+        # wraps() keeps the decorated function's __name__, __doc__ and
+        # signature metadata visible to logging and introspection.
+        @wraps(func)
+        async def wrapper(*args, **kwargs):
+            breaker = circuit_breaker_manager.get_breaker(name, config)
+            return await breaker.call(func, *args, **kwargs)
+        return wrapper
+    return decorator
--- a/backend/services/blog_writer/database_task_manager.py
+++ b/backend/services/blog_writer/database_task_manager.py
@@ -0,0 +1,536 @@
+"""
+Database-Backed Task Manager for Blog Writer
+
+Replaces in-memory task storage with persistent database storage for
+reliability, recovery, and analytics.
+"""
+
+import asyncio
+import uuid
+import json
+from datetime import datetime, timedelta
+from typing import Any, Dict, List, Optional
+from loguru import logger
+
+from services.blog_writer.logger_config import blog_writer_logger, log_function_call
+from models.blog_models import (
+    BlogResearchRequest,
+    BlogOutlineRequest,
+    MediumBlogGenerateRequest,
+    MediumBlogGenerateResult,
+)
+from services.blog_writer.blog_service import BlogWriterService
+
+
+class DatabaseTaskManager:
+    """Database-backed task manager for blog writer operations."""
+    
+    def __init__(self, db_connection):
+        """Keep the database handle, build the blog service and start cleanup.
+
+        Note: the cleanup loop is scheduled with ``asyncio.create_task``, which
+        needs a running event loop.
+        """
+        self._cleanup_task = None
+        self.db = db_connection
+        self.service = BlogWriterService()
+        self._start_cleanup_task()
+    
+    def _start_cleanup_task(self):
+        """Schedule the endless loop that purges old finished tasks."""
+        normal_interval = 3600  # Run every hour
+        error_backoff = 300  # Wait 5 minutes on error
+
+        async def _purge_forever():
+            while True:
+                try:
+                    await self.cleanup_old_tasks()
+                    await asyncio.sleep(normal_interval)
+                except Exception as exc:
+                    logger.error(f"Error in cleanup task: {exc}")
+                    await asyncio.sleep(error_backoff)
+
+        self._cleanup_task = asyncio.create_task(_purge_forever())
+    
+    @log_function_call("create_task")
+    async def create_task(
+        self,
+        user_id: str,
+        task_type: str,
+        request_data: Dict[str, Any],
+        correlation_id: Optional[str] = None,
+        operation: Optional[str] = None,
+        priority: int = 0,
+        max_retries: int = 3,
+        metadata: Optional[Dict[str, Any]] = None
+    ) -> str:
+        """Insert a new 'pending' task row and return its generated id.
+
+        ``request_data`` and ``metadata`` are stored JSON-encoded; a fresh
+        correlation id is generated when none is supplied.
+        """
+        new_task_id = str(uuid.uuid4())
+        if not correlation_id:
+            correlation_id = str(uuid.uuid4())
+
+        insert_sql = """
+        INSERT INTO blog_writer_tasks 
+        (id, user_id, task_type, status, request_data, correlation_id, operation, priority, max_retries, metadata)
+        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
+        """
+
+        # Positional values, in the same order as the column list above.
+        row_values = (
+            new_task_id,
+            user_id,
+            task_type,
+            'pending',
+            json.dumps(request_data),
+            correlation_id,
+            operation,
+            priority,
+            max_retries,
+            json.dumps(metadata or {}),
+        )
+        await self.db.execute(insert_sql, *row_values)
+
+        blog_writer_logger.log_operation_start(
+            "task_created",
+            task_id=new_task_id,
+            task_type=task_type,
+            user_id=user_id,
+            correlation_id=correlation_id
+        )
+
+        return new_task_id
+    
+    @log_function_call("get_task_status")
+    async def get_task_status(self, task_id: str) -> Optional[Dict[str, Any]]:
+        """Get the status of a task together with its latest progress messages.
+
+        Args:
+            task_id: Id of the task row to look up.
+
+        Returns:
+            ``None`` when no task has this id; otherwise a dict with the task's
+            columns, its decoded ``result`` / ``error`` / ``metadata`` JSON and
+            up to 10 ``progress_messages``, newest first.
+        """
+        def _decode_json(raw, default):
+            # JSON columns are written with json.dumps(); also tolerate a
+            # driver that hands back an already-decoded value instead of text.
+            if not raw:
+                return default
+            if isinstance(raw, (str, bytes, bytearray)):
+                return json.loads(raw)
+            return raw
+
+        query = """
+        SELECT 
+            id, user_id, task_type, status, request_data, result_data, error_data,
+            created_at, updated_at, completed_at, correlation_id, operation,
+            retry_count, max_retries, priority, metadata
+        FROM blog_writer_tasks 
+        WHERE id = $1
+        """
+
+        row = await self.db.fetchrow(query, task_id)
+        if not row:
+            return None
+
+        # Get progress messages
+        progress_query = """
+        SELECT timestamp, message, percentage, progress_type, metadata
+        FROM blog_writer_task_progress 
+        WHERE task_id = $1 
+        ORDER BY timestamp DESC 
+        LIMIT 10
+        """
+
+        progress_rows = await self.db.fetch(progress_query, task_id)
+        # A distinct loop name keeps the task ``row`` above unambiguous.
+        progress_messages = [
+            {
+                "timestamp": entry["timestamp"].isoformat(),
+                "message": entry["message"],
+                "percentage": float(entry["percentage"]),
+                "progress_type": entry["progress_type"],
+                # Decode like the task-level metadata below, so both come back
+                # as dicts rather than one of them as a raw JSON string.
+                "metadata": _decode_json(entry["metadata"], {})
+            }
+            for entry in progress_rows
+        ]
+
+        return {
+            "task_id": row["id"],
+            "user_id": row["user_id"],
+            "task_type": row["task_type"],
+            "status": row["status"],
+            "created_at": row["created_at"].isoformat(),
+            "updated_at": row["updated_at"].isoformat(),
+            "completed_at": row["completed_at"].isoformat() if row["completed_at"] else None,
+            "correlation_id": row["correlation_id"],
+            "operation": row["operation"],
+            "retry_count": row["retry_count"],
+            "max_retries": row["max_retries"],
+            "priority": row["priority"],
+            "progress_messages": progress_messages,
+            "result": _decode_json(row["result_data"], None),
+            "error": _decode_json(row["error_data"], None),
+            "metadata": _decode_json(row["metadata"], {})
+        }
+    
+    @log_function_call("update_task_status")
+    async def update_task_status(
+        self,
+        task_id: str,
+        status: str,
+        result_data: Optional[Dict[str, Any]] = None,
+        error_data: Optional[Dict[str, Any]] = None,
+        completed_at: Optional[datetime] = None
+    ):
+        """Write a task's status, result, error and completion time.
+
+        The statement sets all four columns, so a missing/empty ``result_data``
+        or ``error_data`` is stored as NULL. When ``completed_at`` is not given
+        it defaults to the current time for the terminal statuses
+        ('completed', 'failed', 'cancelled') and to NULL otherwise.
+        """
+        update_sql = """
+        UPDATE blog_writer_tasks 
+        SET status = $2, result_data = $3, error_data = $4, completed_at = $5, updated_at = NOW()
+        WHERE id = $1
+        """
+
+        result_json = json.dumps(result_data) if result_data else None
+        error_json = json.dumps(error_data) if error_data else None
+
+        finished_at = completed_at
+        if not finished_at:
+            if status in ['completed', 'failed', 'cancelled']:
+                finished_at = datetime.now()
+            else:
+                finished_at = None
+
+        await self.db.execute(update_sql, task_id, status, result_json, error_json, finished_at)
+
+        blog_writer_logger.log_operation_end(
+            "task_status_updated",
+            0,
+            success=status in ['completed', 'cancelled'],
+            task_id=task_id,
+            status=status
+        )
+    
+    @log_function_call("update_progress")
+    async def update_progress(
+        self,
+        task_id: str,
+        message: str,
+        percentage: Optional[float] = None,
+        progress_type: str = "info",
+        metadata: Optional[Dict[str, Any]] = None
+    ):
+        """Append a progress record and move a 'pending' task to 'running'.
+
+        A missing percentage is stored as 0.0 and ``metadata`` is stored
+        JSON-encoded (``{}`` when omitted).
+        """
+        insert_progress_sql = """
+        INSERT INTO blog_writer_task_progress 
+        (task_id, message, percentage, progress_type, metadata)
+        VALUES ($1, $2, $3, $4, $5)
+        """
+        progress_values = (
+            task_id,
+            message,
+            percentage or 0.0,
+            progress_type,
+            json.dumps(metadata or {}),
+        )
+        await self.db.execute(insert_progress_sql, *progress_values)
+
+        # Only tasks still marked 'pending' are flipped; other statuses stay.
+        mark_running_sql = """
+        UPDATE blog_writer_tasks 
+        SET status = 'running', updated_at = NOW()
+        WHERE id = $1 AND status = 'pending'
+        """
+        await self.db.execute(mark_running_sql, task_id)
+
+        logger.info(f"Progress update for task {task_id}: {message}")
+    
+    @log_function_call("record_metrics")
+    async def record_metrics(
+        self,
+        task_id: str,
+        operation: str,
+        duration_ms: int,
+        token_usage: Optional[Dict[str, int]] = None,
+        api_calls: int = 0,
+        cache_hits: int = 0,
+        cache_misses: int = 0,
+        error_count: int = 0,
+        metadata: Optional[Dict[str, Any]] = None
+    ):
+        """Store one metrics row for a task and emit a performance log entry.
+
+        ``token_usage`` is stored as JSON (NULL when missing or empty) and
+        ``metadata`` as JSON (``{}`` when omitted).
+        """
+        insert_sql = """
+        INSERT INTO blog_writer_task_metrics 
+        (task_id, operation, duration_ms, token_usage, api_calls, cache_hits, cache_misses, error_count, metadata)
+        VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
+        """
+
+        token_usage_json = json.dumps(token_usage) if token_usage else None
+        metadata_json = json.dumps(metadata or {})
+
+        await self.db.execute(
+            insert_sql,
+            task_id, operation, duration_ms, token_usage_json,
+            api_calls, cache_hits, cache_misses, error_count, metadata_json,
+        )
+
+        blog_writer_logger.log_performance(
+            f"task_metrics_{operation}",
+            duration_ms,
+            "ms",
+            task_id=task_id,
+            operation=operation,
+            api_calls=api_calls,
+            cache_hits=cache_hits,
+            cache_misses=cache_misses
+        )
+    
+    @log_function_call("increment_retry_count")
+    async def increment_retry_count(self, task_id: str) -> int:
+        """Bump the task's retry counter by one and return the updated value.
+
+        Returns 0 when the query yields no value.
+        """
+        bump_sql = """
+        UPDATE blog_writer_tasks 
+        SET retry_count = retry_count + 1, updated_at = NOW()
+        WHERE id = $1
+        RETURNING retry_count
+        """
+
+        new_count = await self.db.fetchval(bump_sql, task_id)
+        if not new_count:
+            return 0
+        return new_count
+    
+    @log_function_call("cleanup_old_tasks")
+    async def cleanup_old_tasks(self, days: int = 7) -> int:
+        """Delete finished tasks created more than ``days`` days ago.
+
+        Only rows whose status is 'completed', 'failed' or 'cancelled' are
+        removed.
+
+        Args:
+            days: Age threshold in whole days.
+
+        Returns:
+            Number of deleted rows, parsed from the last token of the status
+            string returned by ``db.execute``; 0 when nothing is returned.
+        """
+        # The age is passed as a bound parameter instead of being formatted
+        # into the SQL text, so the statement cannot be altered by the value.
+        query = """
+        DELETE FROM blog_writer_tasks 
+        WHERE status IN ('completed', 'failed', 'cancelled') 
+        AND created_at < NOW() - ($1::int * INTERVAL '1 day')
+        """
+
+        result = await self.db.execute(query, int(days))
+        deleted_count = int(result.split()[-1]) if result else 0
+
+        if deleted_count > 0:
+            logger.info(f"Cleaned up {deleted_count} old blog writer tasks")
+
+        return deleted_count
+    
+    @log_function_call("get_user_tasks")
+    async def get_user_tasks(
+        self,
+        user_id: str,
+        limit: int = 50,
+        offset: int = 0,
+        status_filter: Optional[str] = None
+    ) -> List[Dict[str, Any]]:
+        """List a user's tasks, newest first, with optional status filtering.
+
+        ``limit`` and ``offset`` page through the result; timestamps are
+        returned as ISO-8601 strings.
+        """
+        sql = """
+        SELECT 
+            id, task_type, status, created_at, updated_at, completed_at,
+            operation, retry_count, max_retries, priority
+        FROM blog_writer_tasks 
+        WHERE user_id = $1
+        """
+        args = [user_id]
+
+        # Placeholder numbers follow the position of each value in ``args``.
+        if status_filter:
+            args.append(status_filter)
+            sql += f" AND status = ${len(args)}"
+
+        limit_pos = len(args) + 1
+        offset_pos = limit_pos + 1
+        sql += f" ORDER BY created_at DESC LIMIT ${limit_pos} OFFSET ${offset_pos}"
+        args += [limit, offset]
+
+        records = await self.db.fetch(sql, *args)
+
+        tasks = []
+        for record in records:
+            finished = record["completed_at"]
+            tasks.append({
+                "task_id": record["id"],
+                "task_type": record["task_type"],
+                "status": record["status"],
+                "created_at": record["created_at"].isoformat(),
+                "updated_at": record["updated_at"].isoformat(),
+                "completed_at": finished.isoformat() if finished else None,
+                "operation": record["operation"],
+                "retry_count": record["retry_count"],
+                "max_retries": record["max_retries"],
+                "priority": record["priority"]
+            })
+        return tasks
+    
+    @log_function_call("get_task_analytics")
+    async def get_task_analytics(self, days: int = 7) -> Dict[str, Any]:
+        """Get task analytics for monitoring.
+
+        Aggregates tasks created within the last ``days`` days, grouped by task
+        type and status.
+
+        Args:
+            days: Size of the look-back window in whole days.
+
+        Returns:
+            A dict with three keys: ``summary`` (total / completed / failed /
+            running counts), ``by_task_type`` (per type and status: count and
+            average duration in seconds, measured up to now for unfinished
+            tasks) and ``by_status`` (count per status).
+        """
+        # The window is passed as a bound parameter instead of being formatted
+        # into the SQL text, so the statement cannot be altered by the value.
+        query = """
+        SELECT 
+            task_type,
+            status,
+            COUNT(*) as task_count,
+            AVG(EXTRACT(EPOCH FROM (COALESCE(completed_at, NOW()) - created_at))) as avg_duration_seconds,
+            COUNT(CASE WHEN status = 'completed' THEN 1 END) as completed_count,
+            COUNT(CASE WHEN status = 'failed' THEN 1 END) as failed_count,
+            COUNT(CASE WHEN status = 'running' THEN 1 END) as running_count
+        FROM blog_writer_tasks
+        WHERE created_at >= NOW() - ($1::int * INTERVAL '1 day')
+        GROUP BY task_type, status
+        ORDER BY task_type, status
+        """
+
+        rows = await self.db.fetch(query, int(days))
+
+        analytics = {
+            "summary": {
+                "total_tasks": sum(row["task_count"] for row in rows),
+                "completed_tasks": sum(row["completed_count"] for row in rows),
+                "failed_tasks": sum(row["failed_count"] for row in rows),
+                "running_tasks": sum(row["running_count"] for row in rows)
+            },
+            "by_task_type": {},
+            "by_status": {}
+        }
+
+        for row in rows:
+            task_type = row["task_type"]
+            status = row["status"]
+
+            per_type = analytics["by_task_type"].setdefault(task_type, {})
+            per_type[status] = {
+                "count": row["task_count"],
+                "avg_duration_seconds": float(row["avg_duration_seconds"]) if row["avg_duration_seconds"] else 0
+            }
+
+            by_status = analytics["by_status"]
+            by_status[status] = by_status.get(status, 0) + row["task_count"]
+
+        return analytics
+    
+    # Task execution methods (same as original but with database persistence)
+    async def start_research_task(self, request: BlogResearchRequest, user_id: str) -> str:
+        """Start a research operation and return a task ID.
+
+        The task row is created first; the research itself runs in the
+        background, so the returned id can be polled for status.
+        """
+        task_id = await self.create_task(
+            user_id=user_id,
+            task_type="research",
+            request_data=request.dict(),
+            operation="research_operation"
+        )
+
+        # Start the research operation in the background
+        self._spawn_background(self._run_research_task(task_id, request))
+
+        return task_id
+
+    async def start_outline_task(self, request: BlogOutlineRequest, user_id: str) -> str:
+        """Start an outline generation operation and return a task ID.
+
+        The task row is created first; the generation runs in the background.
+        """
+        task_id = await self.create_task(
+            user_id=user_id,
+            task_type="outline",
+            request_data=request.dict(),
+            operation="outline_generation"
+        )
+
+        # Start the outline generation operation in the background
+        self._spawn_background(self._run_outline_generation_task(task_id, request))
+
+        return task_id
+
+    async def start_medium_generation_task(self, request: MediumBlogGenerateRequest, user_id: str) -> str:
+        """Start a medium blog generation task and return a task ID.
+
+        The task row is created first; the generation runs in the background.
+        """
+        task_id = await self.create_task(
+            user_id=user_id,
+            task_type="medium_generation",
+            request_data=request.dict(),
+            operation="medium_blog_generation"
+        )
+
+        self._spawn_background(self._run_medium_generation_task(task_id, request))
+        return task_id
+
+    def _spawn_background(self, coro):
+        """Schedule ``coro`` as a task and hold a reference until it finishes.
+
+        The event loop keeps only weak references to tasks, so a task whose
+        result is discarded can be garbage-collected before it completes.
+        """
+        task = asyncio.create_task(coro)
+        # Created lazily because __init__ does not define this set.
+        running = self.__dict__.setdefault("_background_tasks", set())
+        running.add(task)
+        # Drop the reference once the task is done so the set does not grow.
+        task.add_done_callback(running.discard)
+        return task
+    
+    async def _run_research_task(self, task_id: str, request: BlogResearchRequest):
+        """Run research in the background, recording progress and the final status.
+
+        A result with ``success`` false and any raised exception both end with
+        the task marked 'failed'; exceptions are logged and not re-raised.
+        """
+        try:
+            await self.update_progress(task_id, "🔍 Starting research operation...", 0)
+
+            # Run the actual research with progress updates
+            result = await self.service.research_with_progress(request, task_id)
+
+            if result.success:
+                await self.update_progress(
+                    task_id,
+                    f"✅ Research completed successfully! Found {len(result.sources)} sources and {len(result.search_queries or [])} search queries.",
+                    100,
+                    "success"
+                )
+                await self.update_task_status(task_id, "completed", result_data=result.dict())
+                return
+
+            # Research failed gracefully: the service reported it in the result.
+            await self.update_progress(
+                task_id,
+                f"❌ Research failed: {result.error_message or 'Unknown error'}",
+                100,
+                "error"
+            )
+            failure_details = {
+                "error_message": result.error_message,
+                "retry_suggested": result.retry_suggested,
+                "error_code": result.error_code,
+                "actionable_steps": result.actionable_steps
+            }
+            await self.update_task_status(task_id, "failed", error_data=failure_details)
+
+        except Exception as exc:
+            await self.update_progress(task_id, f"❌ Research failed with error: {str(exc)}", 100, "error")
+            crash_details = {"error_message": str(exc), "error_type": type(exc).__name__}
+            await self.update_task_status(task_id, "failed", error_data=crash_details)
+            blog_writer_logger.log_error(exc, "research_task", context={"task_id": task_id})
+    
+    async def _run_outline_generation_task(self, task_id: str, request: BlogOutlineRequest):
+        """Generate an outline in the background, recording progress and status.
+
+        Any exception marks the task 'failed' and is logged, not re-raised.
+        """
+        try:
+            await self.update_progress(task_id, "🧩 Starting outline generation...", 0)
+
+            # Run the actual outline generation with progress updates
+            result = await self.service.generate_outline_with_progress(request, task_id)
+
+            done_message = f"✅ Outline generated successfully! Created {len(result.outline)} sections with {len(result.title_options)} title options."
+            await self.update_progress(task_id, done_message, 100, "success")
+            await self.update_task_status(task_id, "completed", result_data=result.dict())
+
+        except Exception as exc:
+            await self.update_progress(task_id, f"❌ Outline generation failed: {str(exc)}", 100, "error")
+            crash_details = {"error_message": str(exc), "error_type": type(exc).__name__}
+            await self.update_task_status(task_id, "failed", error_data=crash_details)
+            blog_writer_logger.log_error(exc, "outline_generation_task", context={"task_id": task_id})
+    
+    async def _run_medium_generation_task(self, task_id: str, request: MediumBlogGenerateRequest):
+        """Generate a medium blog in the background, recording progress and status.
+
+        Requests whose global target exceeds 1000 words, and results without
+        sections, are treated as failures. Any exception marks the task
+        'failed' and is logged, not re-raised.
+        """
+        try:
+            await self.update_progress(task_id, "📦 Packaging outline and metadata...", 0)
+
+            # Basic guard: respect global target words (a missing value counts as 1000)
+            requested_words = int(request.globalTargetWords or 1000)
+            if requested_words > 1000:
+                raise ValueError("Global target words exceed 1000; medium generation not allowed")
+
+            result: MediumBlogGenerateResult = await self.service.generate_medium_blog_with_progress(
+                request,
+                task_id,
+            )
+
+            has_sections = bool(result) and bool(getattr(result, "sections", None))
+            if not has_sections:
+                raise ValueError("Empty generation result from model")
+
+            # Pick the completion message depending on whether the result was cached
+            if getattr(result, 'cache_hit', False):
+                completion_note = "⚡ Found cached content - loading instantly!"
+            else:
+                completion_note = "🤖 Generated fresh content with AI..."
+            await self.update_progress(task_id, completion_note, 100, "success")
+
+            await self.update_task_status(task_id, "completed", result_data=result.dict())
+
+        except Exception as exc:
+            await self.update_progress(task_id, f"❌ Medium generation failed: {str(exc)}", 100, "error")
+            crash_details = {"error_message": str(exc), "error_type": type(exc).__name__}
+            await self.update_task_status(task_id, "failed", error_data=crash_details)
+            blog_writer_logger.log_error(exc, "medium_generation_task", context={"task_id": task_id})
--- a/backend/services/blog_writer/exceptions.py
+++ b/backend/services/blog_writer/exceptions.py
@@ -0,0 +1,285 @@
+"""
+Blog Writer Exception Hierarchy
+
+Defines custom exception classes for different failure modes in the AI Blog Writer.
+Each exception includes error_code, user_message, retry_suggested, and actionable_steps.
+"""
+
+from typing import List, Optional, Dict, Any
+from enum import Enum
+
+
+class ErrorCategory(Enum):
+    """Coarse classification attached to every Blog Writer exception.
+
+    The string value of a member is what ``BlogWriterException.to_dict`` emits
+    under the ``error_category`` key.
+    """
+    TRANSIENT = "transient"  # Temporary issues, retry recommended
+    PERMANENT = "permanent"  # Permanent issues, no retry
+    USER_ERROR = "user_error"  # User input issues, fix input
+    API_ERROR = "api_error"  # External API issues
+    VALIDATION_ERROR = "validation_error"  # Data validation issues
+    SYSTEM_ERROR = "system_error"  # Internal system issues
+
+
+class BlogWriterException(Exception):
+    """Root of the Blog Writer exception hierarchy.
+
+    Besides the internal ``message`` (passed to ``Exception``), every instance
+    carries a machine-readable ``error_code``, a ``user_message`` suitable for
+    display, a retry hint, a list of actionable steps, an ``ErrorCategory`` and a
+    free-form ``context`` dictionary.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        error_code: str,
+        user_message: str,
+        retry_suggested: bool = False,
+        actionable_steps: Optional[List[str]] = None,
+        error_category: ErrorCategory = ErrorCategory.SYSTEM_ERROR,
+        context: Optional[Dict[str, Any]] = None
+    ):
+        super().__init__(message)
+        self.error_code = error_code
+        self.user_message = user_message
+        self.retry_suggested = retry_suggested
+        # Missing or empty collections are replaced by a fresh empty one per instance.
+        self.actionable_steps = actionable_steps if actionable_steps else []
+        self.error_category = error_category
+        self.context = context if context else {}
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the API-facing representation of this error.
+
+        The internal ``message`` is not included; the category is emitted as its
+        string value.
+        """
+        plain_fields = ("error_code", "user_message", "retry_suggested", "actionable_steps")
+        payload: Dict[str, Any] = {field: getattr(self, field) for field in plain_fields}
+        payload["error_category"] = self.error_category.value
+        payload["context"] = self.context
+        return payload
+
+
+class ResearchFailedException(BlogWriterException):
+    """Signals that the research operation could not complete.
+
+    Uses error code ``RESEARCH_FAILED`` and the ``API_ERROR`` category; a retry is
+    suggested unless the caller says otherwise.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        user_message: str = "Research failed. Please try again with different keywords or check your internet connection.",
+        retry_suggested: bool = True,
+        context: Optional[Dict[str, Any]] = None
+    ):
+        # Recovery hints surfaced to the user together with the error.
+        recovery_steps = [
+            "Try with different keywords",
+            "Check your internet connection",
+            "Wait a few minutes and try again",
+            "Contact support if the issue persists"
+        ]
+        super().__init__(
+            message,
+            "RESEARCH_FAILED",
+            user_message,
+            retry_suggested=retry_suggested,
+            actionable_steps=recovery_steps,
+            error_category=ErrorCategory.API_ERROR,
+            context=context,
+        )
+
+
+class OutlineGenerationException(BlogWriterException):
+    """Signals that an outline could not be generated.
+
+    Uses error code ``OUTLINE_GENERATION_FAILED`` and the ``API_ERROR`` category;
+    a retry is suggested unless the caller says otherwise.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        user_message: str = "Outline generation failed. Please try again or adjust your research data.",
+        retry_suggested: bool = True,
+        context: Optional[Dict[str, Any]] = None
+    ):
+        # Recovery hints surfaced to the user together with the error.
+        recovery_steps = [
+            "Try generating outline again",
+            "Check if research data is complete",
+            "Try with different research keywords",
+            "Contact support if the issue persists"
+        ]
+        super().__init__(
+            message,
+            "OUTLINE_GENERATION_FAILED",
+            user_message,
+            retry_suggested=retry_suggested,
+            actionable_steps=recovery_steps,
+            error_category=ErrorCategory.API_ERROR,
+            context=context,
+        )
+
+
+class ContentGenerationException(BlogWriterException):
+    """Signals that blog content could not be generated.
+
+    Uses error code ``CONTENT_GENERATION_FAILED`` and the ``API_ERROR`` category;
+    a retry is suggested unless the caller says otherwise.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        user_message: str = "Content generation failed. Please try again or adjust your outline.",
+        retry_suggested: bool = True,
+        context: Optional[Dict[str, Any]] = None
+    ):
+        # Recovery hints surfaced to the user together with the error.
+        recovery_steps = [
+            "Try generating content again",
+            "Check if outline is complete",
+            "Try with a shorter outline",
+            "Contact support if the issue persists"
+        ]
+        super().__init__(
+            message,
+            "CONTENT_GENERATION_FAILED",
+            user_message,
+            retry_suggested=retry_suggested,
+            actionable_steps=recovery_steps,
+            error_category=ErrorCategory.API_ERROR,
+            context=context,
+        )
+
+
+class SEOAnalysisException(BlogWriterException):
+    """Signals that SEO analysis could not be performed.
+
+    Uses error code ``SEO_ANALYSIS_FAILED`` and the ``API_ERROR`` category; a
+    retry is suggested unless the caller says otherwise.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        user_message: str = "SEO analysis failed. Content was generated but SEO optimization is unavailable.",
+        retry_suggested: bool = True,
+        context: Optional[Dict[str, Any]] = None
+    ):
+        # Recovery hints surfaced to the user together with the error.
+        recovery_steps = [
+            "Try SEO analysis again",
+            "Continue without SEO optimization",
+            "Contact support if the issue persists"
+        ]
+        super().__init__(
+            message,
+            "SEO_ANALYSIS_FAILED",
+            user_message,
+            retry_suggested=retry_suggested,
+            actionable_steps=recovery_steps,
+            error_category=ErrorCategory.API_ERROR,
+            context=context,
+        )
+
+
+class APIRateLimitException(BlogWriterException):
+    """Raised when API rate limit is exceeded.
+
+    The suggested wait is kept on the instance as ``retry_after`` and mirrored
+    into ``context["retry_after"]`` so that handlers reading the context can
+    report the real delay instead of a fallback value.
+
+    Args:
+        message: Internal error description.
+        retry_after: Suggested wait in seconds, if known.
+        context: Optional extra data; it is copied, never mutated. A
+            ``retry_after`` key already present in it is left untouched.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        retry_after: Optional[int] = None,
+        context: Optional[Dict[str, Any]] = None
+    ):
+        if retry_after:
+            retry_message = f"Rate limit exceeded. Please wait {retry_after} seconds before trying again."
+        else:
+            retry_message = "Rate limit exceeded. Please wait a few minutes before trying again."
+
+        # Work on a copy so the caller's dictionary is not modified.
+        merged_context: Dict[str, Any] = dict(context or {})
+        if retry_after is not None:
+            merged_context.setdefault("retry_after", retry_after)
+
+        super().__init__(
+            message=message,
+            error_code="API_RATE_LIMIT",
+            user_message=retry_message,
+            retry_suggested=True,
+            actionable_steps=[
+                f"Wait {retry_after or 60} seconds before trying again",
+                "Reduce the frequency of requests",
+                "Try again during off-peak hours",
+                "Contact support if you need higher limits"
+            ],
+            error_category=ErrorCategory.API_ERROR,
+            context=merged_context
+        )
+        self.retry_after = retry_after
+
+
+class APITimeoutException(BlogWriterException):
+    """Signals that an API request timed out.
+
+    Uses error code ``API_TIMEOUT`` and the ``TRANSIENT`` category; a retry is
+    always suggested. ``timeout_seconds`` only appears in the user message.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        timeout_seconds: int = 60,
+        context: Optional[Dict[str, Any]] = None
+    ):
+        timeout_notice = f"Request timed out after {timeout_seconds} seconds. Please try again."
+        # Recovery hints surfaced to the user together with the error.
+        recovery_steps = [
+            "Try again with a shorter request",
+            "Check your internet connection",
+            "Try again during off-peak hours",
+            "Contact support if the issue persists"
+        ]
+        super().__init__(
+            message,
+            "API_TIMEOUT",
+            timeout_notice,
+            retry_suggested=True,
+            actionable_steps=recovery_steps,
+            error_category=ErrorCategory.TRANSIENT,
+            context=context,
+        )
+
+
+class ValidationException(BlogWriterException):
+    """Signals that user-supplied input failed validation.
+
+    Uses error code ``VALIDATION_ERROR`` and the ``USER_ERROR`` category; a retry
+    is never suggested. ``field`` names the offending input in the first
+    actionable step.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        field: str,
+        user_message: str = "Invalid input provided. Please check your data and try again.",
+        context: Optional[Dict[str, Any]] = None
+    ):
+        # Recovery hints; the first one points at the field that failed.
+        recovery_steps = [
+            f"Check the {field} field",
+            "Ensure all required fields are filled",
+            "Verify data format is correct",
+            "Contact support if you need help"
+        ]
+        super().__init__(
+            message,
+            "VALIDATION_ERROR",
+            user_message,
+            retry_suggested=False,
+            actionable_steps=recovery_steps,
+            error_category=ErrorCategory.USER_ERROR,
+            context=context,
+        )
+
+
+class CircuitBreakerOpenException(BlogWriterException):
+    """Signals that a call was refused because the circuit breaker is open.
+
+    Uses error code ``CIRCUIT_BREAKER_OPEN`` and the ``TRANSIENT`` category; a
+    retry is always suggested. ``retry_after`` is quoted in the user message and
+    in the first actionable step.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        retry_after: int,
+        context: Optional[Dict[str, Any]] = None
+    ):
+        unavailable_notice = f"Service temporarily unavailable. Please wait {retry_after} seconds before trying again."
+        # Recovery hints surfaced to the user together with the error.
+        recovery_steps = [
+            f"Wait {retry_after} seconds before trying again",
+            "Try again during off-peak hours",
+            "Contact support if the issue persists"
+        ]
+        super().__init__(
+            message,
+            "CIRCUIT_BREAKER_OPEN",
+            unavailable_notice,
+            retry_suggested=True,
+            actionable_steps=recovery_steps,
+            error_category=ErrorCategory.TRANSIENT,
+            context=context,
+        )
+
+
+class PartialSuccessException(BlogWriterException):
+    """Signals that only part of an operation succeeded.
+
+    Uses error code ``PARTIAL_SUCCESS`` and the ``TRANSIENT`` category; a retry
+    is always suggested. The successful output and the names of the failed
+    operations stay available as ``partial_results`` and ``failed_operations``.
+    """
+
+    def __init__(
+        self,
+        message: str,
+        partial_results: Dict[str, Any],
+        failed_operations: List[str],
+        user_message: str = "Operation partially completed. Some sections were generated successfully.",
+        context: Optional[Dict[str, Any]] = None
+    ):
+        # Recovery hints surfaced to the user together with the error.
+        recovery_steps = [
+            "Review the generated content",
+            "Retry failed sections individually",
+            "Contact support if you need help with failed sections"
+        ]
+        super().__init__(
+            message,
+            "PARTIAL_SUCCESS",
+            user_message,
+            retry_suggested=True,
+            actionable_steps=recovery_steps,
+            error_category=ErrorCategory.TRANSIENT,
+            context=context,
+        )
+        self.partial_results = partial_results
+        self.failed_operations = failed_operations
--- a/backend/services/blog_writer/logger_config.py
+++ b/backend/services/blog_writer/logger_config.py
@@ -0,0 +1,293 @@
+"""
+Structured Logging Configuration for Blog Writer
+
+Configures structured JSON logging with correlation IDs, context tracking,
+and performance metrics for the AI Blog Writer system.
+"""
+
+import json
+import uuid
+import time
+import sys
+from typing import Dict, Any, Optional
+from contextvars import ContextVar
+from loguru import logger
+from datetime import datetime
+
+# Context variables for request tracking.
+# Each one defaults to '' until BlogWriterLogger.set_context() assigns a value,
+# and BlogWriterLogger.clear_context() resets all four back to ''.
+correlation_id: ContextVar[str] = ContextVar('correlation_id', default='')
+user_id: ContextVar[str] = ContextVar('user_id', default='')
+task_id: ContextVar[str] = ContextVar('task_id', default='')
+operation: ContextVar[str] = ContextVar('operation', default='')
+
+
+class BlogWriterLogger:
+    """Structured logging facade for the Blog Writer.
+
+    The ``log_*`` methods emit through the module-level loguru ``logger``. Their
+    structured fields are attached with ``logger.bind`` so they land directly in
+    ``record["extra"]``, and the pre-built message is passed as a positional
+    argument to a ``"{}"`` template. Loguru formats messages with ``str.format``
+    whenever extra arguments are supplied, so handing it the finished text as the
+    template would raise for any message containing ``{`` or ``}`` (common in
+    API error payloads).
+    """
+
+    def __init__(self):
+        # NOTE(review): the logger returned by _setup_logger() is not stored; the
+        # call is kept only for whatever configuration it performs - confirm.
+        self._setup_logger()
+
+    def _setup_logger(self):
+        """Obtain the project's service logger for ``blog_writer``."""
+        from utils.logger_utils import get_service_logger
+        return get_service_logger("blog_writer")
+
+    @staticmethod
+    def _emit(
+        level: str,
+        message: str,
+        fields: Dict[str, Any],
+        exception: Optional[BaseException] = None
+    ):
+        """Send one structured record to loguru.
+
+        Args:
+            level: Loguru level name, e.g. ``"INFO"`` or ``"ERROR"``.
+            message: Final message text; it is never used as a format template.
+            fields: Structured fields stored in ``record["extra"]``.
+            exception: Exception whose traceback should accompany the record.
+        """
+        # depth=1 attributes the record to the public log_* method that called us.
+        logger.bind(**fields).opt(depth=1, exception=exception).log(level, "{}", message)
+
+    def _json_formatter(self, record):
+        """Format log record as structured JSON.
+
+        Fields from ``record["extra"]`` are merged last, so they overwrite core
+        keys of the same name.
+        """
+        # Extract context variables
+        correlation_id_val = correlation_id.get('')
+        user_id_val = user_id.get('')
+        task_id_val = task_id.get('')
+        operation_val = operation.get('')
+
+        # Build structured log entry
+        log_entry = {
+            "timestamp": datetime.fromtimestamp(record["time"].timestamp()).isoformat(),
+            "level": record["level"].name,
+            "logger": record["name"],
+            "function": record["function"],
+            "line": record["line"],
+            "message": record["message"],
+            "correlation_id": correlation_id_val,
+            "user_id": user_id_val,
+            "task_id": task_id_val,
+            "operation": operation_val,
+            "module": record["module"],
+            "process_id": record["process"].id,
+            "thread_id": record["thread"].id
+        }
+
+        # Add exception info if present
+        # NOTE(review): the traceback entry is serialized via default=str below,
+        # which presumably yields an object repr rather than a formatted stack.
+        if record["exception"]:
+            log_entry["exception"] = {
+                "type": record["exception"].type.__name__,
+                "value": str(record["exception"].value),
+                "traceback": record["exception"].traceback
+            }
+
+        # Add extra fields from record
+        if record["extra"]:
+            log_entry.update(record["extra"])
+
+        return json.dumps(log_entry, default=str)
+
+    def set_context(
+        self,
+        correlation_id_val: Optional[str] = None,
+        user_id_val: Optional[str] = None,
+        task_id_val: Optional[str] = None,
+        operation_val: Optional[str] = None
+    ):
+        """Set context variables for the current request.
+
+        Only truthy arguments are applied; ``None`` or ``''`` leaves the
+        corresponding variable unchanged.
+        """
+        if correlation_id_val:
+            correlation_id.set(correlation_id_val)
+        if user_id_val:
+            user_id.set(user_id_val)
+        if task_id_val:
+            task_id.set(task_id_val)
+        if operation_val:
+            operation.set(operation_val)
+
+    def clear_context(self):
+        """Reset all four context variables to the empty string."""
+        correlation_id.set('')
+        user_id.set('')
+        task_id.set('')
+        operation.set('')
+
+    def generate_correlation_id(self) -> str:
+        """Return a new random UUID4 string to use as a correlation ID."""
+        return str(uuid.uuid4())
+
+    def log_operation_start(
+        self,
+        operation_name: str,
+        **kwargs
+    ):
+        """Log the start of an operation.
+
+        ``kwargs`` become additional structured fields and may override the
+        ``operation`` / ``event_type`` fields set here.
+        """
+        self._emit(
+            "INFO",
+            f"Starting {operation_name}",
+            {
+                "operation": operation_name,
+                "event_type": "operation_start",
+                **kwargs
+            },
+        )
+
+    def log_operation_end(
+        self,
+        operation_name: str,
+        duration_ms: float,
+        success: bool = True,
+        **kwargs
+    ):
+        """Log the end of an operation together with its duration in milliseconds."""
+        self._emit(
+            "INFO",
+            f"Completed {operation_name} in {duration_ms:.2f}ms",
+            {
+                "operation": operation_name,
+                "event_type": "operation_end",
+                "duration_ms": duration_ms,
+                "success": success,
+                **kwargs
+            },
+        )
+
+    def log_api_call(
+        self,
+        api_name: str,
+        endpoint: str,
+        duration_ms: float,
+        status_code: Optional[int] = None,
+        token_usage: Optional[Dict[str, int]] = None,
+        **kwargs
+    ):
+        """Log one external API call with its duration, status and token usage."""
+        self._emit(
+            "INFO",
+            f"API call to {api_name}",
+            {
+                "event_type": "api_call",
+                "api_name": api_name,
+                "endpoint": endpoint,
+                "duration_ms": duration_ms,
+                "status_code": status_code,
+                "token_usage": token_usage,
+                **kwargs
+            },
+        )
+
+    def log_error(
+        self,
+        error: Exception,
+        operation: str,
+        context: Optional[Dict[str, Any]] = None
+    ):
+        """Log an error with its type, message, caller context and traceback.
+
+        Safe to call from ``except`` blocks even when ``str(error)`` contains
+        braces, because the message is never used as a format template.
+        """
+        error_text = str(error)
+        self._emit(
+            "ERROR",
+            f"Error in {operation}: {error_text}",
+            {
+                "event_type": "error",
+                "operation": operation,
+                "error_type": type(error).__name__,
+                "error_message": error_text,
+                "context": context or {}
+            },
+            exception=error,
+        )
+
+    def log_performance(
+        self,
+        metric_name: str,
+        value: float,
+        unit: str = "ms",
+        **kwargs
+    ):
+        """Log a single named performance metric with its unit."""
+        self._emit(
+            "INFO",
+            f"Performance metric: {metric_name} = {value} {unit}",
+            {
+                "event_type": "performance",
+                "metric_name": metric_name,
+                "value": value,
+                "unit": unit,
+                **kwargs
+            },
+        )
+
+
+# Global logger instance shared by the Blog Writer modules.
+# Constructing it runs BlogWriterLogger._setup_logger() when this module is imported.
+blog_writer_logger = BlogWriterLogger()
+
+
+def get_logger(name: str = "blog_writer"):
+    """Return the loguru logger with ``name`` bound to it via ``logger.bind``."""
+    named_logger = logger.bind(name=name)
+    return named_logger
+
+
+def log_function_call(func_name: str, **kwargs):
+    """Build a decorator that logs the start, duration and failures of a call.
+
+    Works for plain functions and coroutine functions alike. The wrapper keeps
+    the decorated callable's metadata (``__name__``, ``__doc__``,
+    ``__wrapped__``) via ``functools.wraps``.
+
+    Args:
+        func_name: Operation name used in every emitted log entry.
+        **kwargs: Extra fields attached to the "operation start" entry only.
+
+    Returns:
+        A decorator. Exceptions raised by the wrapped callable are logged through
+        ``blog_writer_logger.log_error`` and then re-raised unchanged.
+    """
+    import asyncio
+    import functools
+
+    def decorator(func):
+        def _begin():
+            """Log the start entry; return the start tick and correlation ID."""
+            # perf_counter is monotonic, so durations survive wall-clock changes.
+            started = time.perf_counter()
+            correlation_id_val = correlation_id.get('')
+            blog_writer_logger.log_operation_start(
+                func_name,
+                function=func.__name__,
+                correlation_id=correlation_id_val,
+                **kwargs
+            )
+            return started, correlation_id_val
+
+        def _elapsed_ms(started):
+            """Milliseconds elapsed since ``started``."""
+            return (time.perf_counter() - started) * 1000
+
+        def _succeeded(started, correlation_id_val):
+            """Log the successful completion entry."""
+            blog_writer_logger.log_operation_end(
+                func_name,
+                _elapsed_ms(started),
+                success=True,
+                function=func.__name__,
+                correlation_id=correlation_id_val
+            )
+
+        def _failed(error, started, correlation_id_val):
+            """Log the failure entry for ``error``."""
+            blog_writer_logger.log_error(
+                error,
+                func_name,
+                context={
+                    "function": func.__name__,
+                    "duration_ms": _elapsed_ms(started),
+                    "correlation_id": correlation_id_val
+                }
+            )
+
+        @functools.wraps(func)
+        async def async_wrapper(*args, **func_kwargs):
+            started, correlation_id_val = _begin()
+            try:
+                result = await func(*args, **func_kwargs)
+                _succeeded(started, correlation_id_val)
+                return result
+            except Exception as e:
+                _failed(e, started, correlation_id_val)
+                raise
+
+        @functools.wraps(func)
+        def sync_wrapper(*args, **func_kwargs):
+            started, correlation_id_val = _begin()
+            try:
+                result = func(*args, **func_kwargs)
+                _succeeded(started, correlation_id_val)
+                return result
+            except Exception as e:
+                _failed(e, started, correlation_id_val)
+                raise
+
+        # Return appropriate wrapper based on function type
+        if asyncio.iscoroutinefunction(func):
+            return async_wrapper
+        return sync_wrapper
+
+    return decorator
--- a/backend/services/blog_writer/research/research_service.py
+++ b/backend/services/blog_writer/research/research_service.py
@@ -16,6 +16,7 @@ from models.blog_models import (
    GroundingSupport,
    Citation,
 )
+from services.blog_writer.logger_config import blog_writer_logger, log_function_call

 from .keyword_analyzer import KeywordAnalyzer
 from .competitor_analyzer import CompetitorAnalyzer
@@ -32,6 +33,7 @@ class ResearchService:
        self.content_angle_generator = ContentAngleGenerator()
        self.data_filter = ResearchDataFilter()
    
+    @log_function_call("research_operation")
    async def research(self, request: BlogResearchRequest) -> BlogResearchResponse:
        """
        Stage 1: Research & Strategy (AI Orchestration)
@@ -47,6 +49,16 @@ class ResearchService:
            industry = request.industry or (request.persona.industry if request.persona and request.persona.industry else "General")
            target_audience = getattr(request.persona, 'target_audience', 'General') if request.persona else 'General'
            
+            # Log research parameters
+            blog_writer_logger.log_operation_start(
+                "research",
+                topic=topic,
+                industry=industry,
+                target_audience=target_audience,
+                keywords=request.keywords,
+                keyword_count=len(request.keywords)
+            )
+            
            # Check cache first for exact keyword match
            cached_result = research_cache.get_cached_result(
                keywords=request.keywords,
@@ -56,10 +68,12 @@ class ResearchService:
            
            if cached_result:
                logger.info(f"Returning cached research result for keywords: {request.keywords}")
+                blog_writer_logger.log_operation_end("research", 0, success=True, cache_hit=True)
                return BlogResearchResponse(**cached_result)
            
            # Cache miss - proceed with API call
            logger.info(f"Cache miss - making API call for keywords: {request.keywords}")
+            blog_writer_logger.log_operation_start("gemini_api_call", api_name="gemini_grounded", operation="research")
            gemini = GeminiGroundedProvider()

            # Single comprehensive research prompt - Gemini handles Google Search automatically
@@ -82,11 +96,23 @@ class ResearchService:
            """
            
            # Single Gemini call with native Google Search grounding - no fallbacks
+            import time
+            api_start_time = time.time()
            gemini_result = await gemini.generate_grounded_content(
                prompt=research_prompt,
                content_type="research",
                max_tokens=2000
            )
+            api_duration_ms = (time.time() - api_start_time) * 1000
+            
+            # Log API call performance
+            blog_writer_logger.log_api_call(
+                "gemini_grounded",
+                "generate_grounded_content",
+                api_duration_ms,
+                token_usage=gemini_result.get("token_usage", {}),
+                content_length=len(gemini_result.get("content", ""))
+            )
            
            # Extract sources from grounding metadata
            sources = self._extract_sources_from_grounding(gemini_result)
@@ -105,6 +131,17 @@ class ResearchService:
            suggested_angles = self.content_angle_generator.generate(content, topic, industry)
            
            logger.info(f"Research completed successfully with {len(sources)} sources and {len(search_queries)} search queries")
+            
+            # Log analysis results
+            blog_writer_logger.log_performance(
+                "research_analysis",
+                len(content),
+                "characters",
+                sources_count=len(sources),
+                search_queries_count=len(search_queries),
+                keyword_analysis_keys=len(keyword_analysis),
+                suggested_angles_count=len(suggested_angles)
+            )

            # Create the response
            response = BlogResearchResponse(
@@ -146,7 +183,47 @@ class ResearchService:
            error_message = str(e)
            logger.error(f"Research failed: {error_message}")
            
-            # Return a graceful failure response instead of raising
+            # Log error with full context
+            blog_writer_logger.log_error(
+                e,
+                "research",
+                context={
+                    "topic": topic,
+                    "keywords": request.keywords,
+                    "industry": industry,
+                    "target_audience": target_audience
+                }
+            )
+            
+            # Import custom exceptions for better error handling
+            from services.blog_writer.exceptions import (
+                ResearchFailedException, 
+                APIRateLimitException, 
+                APITimeoutException,
+                ValidationException
+            )
+            
+            # Determine if this is a retryable error
+            retry_suggested = True
+            user_message = "Research failed. Please try again with different keywords or check your internet connection."
+            
+            if isinstance(e, APIRateLimitException):
+                retry_suggested = True
+                user_message = f"Rate limit exceeded. Please wait {e.context.get('retry_after', 60)} seconds before trying again."
+            elif isinstance(e, APITimeoutException):
+                retry_suggested = True
+                user_message = "Research request timed out. Please try again with a shorter query or check your internet connection."
+            elif isinstance(e, ValidationException):
+                retry_suggested = False
+                user_message = "Invalid research request. Please check your input parameters and try again."
+            elif "401" in error_message or "403" in error_message:
+                retry_suggested = False
+                user_message = "Authentication failed. Please check your API credentials."
+            elif "400" in error_message:
+                retry_suggested = False
+                user_message = "Invalid request. Please check your input parameters."
+            
+            # Return a graceful failure response with enhanced error information
            return BlogResearchResponse(
                success=False,
                sources=[],
@@ -155,9 +232,18 @@ class ResearchService:
                suggested_angles=[],
                search_widget="",
                search_queries=[],
-                error_message=error_message
+                error_message=user_message,
+                retry_suggested=retry_suggested,
+                error_code=getattr(e, 'error_code', 'RESEARCH_FAILED'),
+                actionable_steps=getattr(e, 'actionable_steps', [
+                    "Try with different keywords",
+                    "Check your internet connection",
+                    "Wait a few minutes and try again",
+                    "Contact support if the issue persists"
+                ])
            )
    
+    @log_function_call("research_with_progress")
    async def research_with_progress(self, request: BlogResearchRequest, task_id: str) -> BlogResearchResponse:
        """
        Research method with progress updates for real-time feedback.
@@ -291,7 +377,47 @@ class ResearchService:
            error_message = str(e)
            logger.error(f"Research failed: {error_message}")
            
-            # Return a graceful failure response instead of raising
+            # Log error with full context
+            blog_writer_logger.log_error(
+                e,
+                "research",
+                context={
+                    "topic": topic,
+                    "keywords": request.keywords,
+                    "industry": industry,
+                    "target_audience": target_audience
+                }
+            )
+            
+            # Import custom exceptions for better error handling
+            from services.blog_writer.exceptions import (
+                ResearchFailedException, 
+                APIRateLimitException, 
+                APITimeoutException,
+                ValidationException
+            )
+            
+            # Determine if this is a retryable error
+            retry_suggested = True
+            user_message = "Research failed. Please try again with different keywords or check your internet connection."
+            
+            if isinstance(e, APIRateLimitException):
+                retry_suggested = True
+                user_message = f"Rate limit exceeded. Please wait {e.context.get('retry_after', 60)} seconds before trying again."
+            elif isinstance(e, APITimeoutException):
+                retry_suggested = True
+                user_message = "Research request timed out. Please try again with a shorter query or check your internet connection."
+            elif isinstance(e, ValidationException):
+                retry_suggested = False
+                user_message = "Invalid research request. Please check your input parameters and try again."
+            elif "401" in error_message or "403" in error_message:
+                retry_suggested = False
+                user_message = "Authentication failed. Please check your API credentials."
+            elif "400" in error_message:
+                retry_suggested = False
+                user_message = "Invalid request. Please check your input parameters."
+            
+            # Return a graceful failure response with enhanced error information
            return BlogResearchResponse(
                success=False,
                sources=[],
@@ -300,7 +426,15 @@ class ResearchService:
                suggested_angles=[],
                search_widget="",
                search_queries=[],
-                error_message=error_message
+                error_message=user_message,
+                retry_suggested=retry_suggested,
+                error_code=getattr(e, 'error_code', 'RESEARCH_FAILED'),
+                actionable_steps=getattr(e, 'actionable_steps', [
+                    "Try with different keywords",
+                    "Check your internet connection",
+                    "Wait a few minutes and try again",
+                    "Contact support if the issue persists"
+                ])
            )

    def _extract_sources_from_grounding(self, gemini_result: Dict[str, Any]) -> List[ResearchSource]:
--- a/backend/services/blog_writer/retry_utils.py
+++ b/backend/services/blog_writer/retry_utils.py
@@ -0,0 +1,223 @@
+"""
+Enhanced Retry Utilities for Blog Writer
+
+Provides advanced retry logic with exponential backoff, jitter, retry budgets,
+and specific error code handling for different types of API failures.
+"""
+
+import asyncio
+import functools
+import random
+import time
+from typing import Callable, Any, Optional, Dict, List
+from dataclasses import dataclass
+from loguru import logger
+
+from .exceptions import APIRateLimitException, APITimeoutException
+
+
+@dataclass
+class RetryConfig:
+    """Configuration for retry behavior."""
+    max_attempts: int = 3
+    base_delay: float = 1.0
+    max_delay: float = 60.0
+    exponential_base: float = 2.0
+    jitter: bool = True
+    max_total_time: float = 300.0  # 5 minutes max total time
+    retryable_errors: List[str] = None
+    
+    def __post_init__(self):
+        if self.retryable_errors is None:
+            self.retryable_errors = [
+                "503", "502", "504",  # Server errors
+                "429",  # Rate limit
+                "timeout", "timed out",
+                "connection", "network",
+                "overloaded", "busy"
+            ]
+
+
+class RetryBudget:
+    """Tracks retry budget to prevent excessive retries."""
+    
+    def __init__(self, max_total_time: float):
+        self.max_total_time = max_total_time
+        self.start_time = time.time()
+        self.used_time = 0.0
+    
+    def can_retry(self) -> bool:
+        """Check if we can still retry within budget."""
+        self.used_time = time.time() - self.start_time
+        return self.used_time < self.max_total_time
+    
+    def remaining_time(self) -> float:
+        """Get remaining time in budget."""
+        return max(0, self.max_total_time - self.used_time)
+
+
+def is_retryable_error(error: Exception, retryable_errors: List[str]) -> bool:
+    """Return True when the error text contains any of the given patterns.
+
+    The comparison is a case-insensitive substring match against
+    ``str(error)``; an empty pattern list never matches.
+    """
+    message = str(error).lower()
+    for candidate in retryable_errors:
+        if candidate.lower() in message:
+            return True
+    return False
+
+
+def calculate_delay(attempt: int, config: RetryConfig) -> float:
+    """Calculate delay for retry attempt with exponential backoff and jitter."""
+    # Exponential backoff
+    delay = config.base_delay * (config.exponential_base ** attempt)
+    
+    # Cap at max delay
+    delay = min(delay, config.max_delay)
+    
+    # Add jitter to prevent thundering herd
+    if config.jitter:
+        jitter_range = delay * 0.1  # 10% jitter
+        delay += random.uniform(-jitter_range, jitter_range)
+    
+    return max(0, delay)
+
+
+async def retry_with_backoff(
+    func: Callable,
+    config: Optional[RetryConfig] = None,
+    operation_name: str = "operation",
+    context: Optional[Dict[str, Any]] = None
+) -> Any:
+    """
+    Retry a function with enhanced backoff and budget management.
+    
+    Args:
+        func: Async function to retry
+        config: Retry configuration
+        operation_name: Name of operation for logging
+        context: Additional context for logging
+        
+    Returns:
+        Function result
+        
+    Raises:
+        Last exception if all retries fail
+    """
+    config = config or RetryConfig()
+    budget = RetryBudget(config.max_total_time)
+    last_exception = None
+    
+    for attempt in range(config.max_attempts):
+        try:
+            # Check if we're still within budget
+            if not budget.can_retry():
+                logger.warning(f"Retry budget exceeded for {operation_name} after {budget.used_time:.2f}s")
+                break
+            
+            # Execute the function
+            result = await func()
+            logger.info(f"{operation_name} succeeded on attempt {attempt + 1}")
+            return result
+            
+        except Exception as e:
+            last_exception = e
+            
+            # Check if this is the last attempt
+            if attempt == config.max_attempts - 1:
+                logger.error(f"{operation_name} failed after {config.max_attempts} attempts: {str(e)}")
+                break
+            
+            # Check if error is retryable
+            if not is_retryable_error(e, config.retryable_errors):
+                logger.warning(f"{operation_name} failed with non-retryable error: {str(e)}")
+                break
+            
+            # Calculate delay and wait
+            delay = calculate_delay(attempt, config)
+            remaining_time = budget.remaining_time()
+            
+            # Don't wait longer than remaining budget
+            if delay > remaining_time:
+                logger.warning(f"Delay {delay:.2f}s exceeds remaining budget {remaining_time:.2f}s for {operation_name}")
+                break
+            
+            logger.warning(
+                f"{operation_name} attempt {attempt + 1} failed: {str(e)}. "
+                f"Retrying in {delay:.2f}s (attempt {attempt + 2}/{config.max_attempts})"
+            )
+            
+            await asyncio.sleep(delay)
+    
+    # If we get here, all retries failed
+    if last_exception:
+        # Enhance exception with retry context
+        if isinstance(last_exception, Exception):
+            error_str = str(last_exception)
+            if "429" in error_str or "rate limit" in error_str.lower():
+                raise APIRateLimitException(
+                    f"Rate limit exceeded after {config.max_attempts} attempts",
+                    retry_after=int(delay * 2),  # Suggest waiting longer
+                    context=context
+                )
+            elif "timeout" in error_str.lower():
+                raise APITimeoutException(
+                    f"Request timed out after {config.max_attempts} attempts",
+                    timeout_seconds=int(config.max_total_time),
+                    context=context
+                )
+        
+        raise last_exception
+    
+    raise Exception(f"{operation_name} failed after {config.max_attempts} attempts")
+
+
+def retry_decorator(
+    config: Optional[RetryConfig] = None,
+    operation_name: Optional[str] = None
+):
+    """
+    Decorator to add retry logic to async functions.
+    
+    Args:
+        config: Retry configuration
+        operation_name: Name of operation for logging
+    """
+    def decorator(func: Callable) -> Callable:
+        async def wrapper(*args, **kwargs):
+            op_name = operation_name or func.__name__
+            return await retry_with_backoff(
+                lambda: func(*args, **kwargs),
+                config=config,
+                operation_name=op_name
+            )
+        return wrapper
+    return decorator
+
+
+# Predefined retry configurations for different operation types
+RESEARCH_RETRY_CONFIG = RetryConfig(
+    max_attempts=3,
+    base_delay=2.0,
+    max_delay=30.0,
+    max_total_time=180.0,  # 3 minutes for research
+    retryable_errors=["503", "429", "timeout", "overloaded", "connection"]
+)
+
+OUTLINE_RETRY_CONFIG = RetryConfig(
+    max_attempts=2,
+    base_delay=1.5,
+    max_delay=20.0,
+    max_total_time=120.0,  # 2 minutes for outline
+    retryable_errors=["503", "429", "timeout", "overloaded"]
+)
+
+CONTENT_RETRY_CONFIG = RetryConfig(
+    max_attempts=3,
+    base_delay=1.0,
+    max_delay=15.0,
+    max_total_time=90.0,  # 1.5 minutes for content
+    retryable_errors=["503", "429", "timeout", "overloaded"]
+)
+
+SEO_RETRY_CONFIG = RetryConfig(
+    max_attempts=2,
+    base_delay=1.0,
+    max_delay=10.0,
+    max_total_time=60.0,  # 1 minute for SEO
+    retryable_errors=["503", "429", "timeout"]
+)