Updated SEO Analysis Modal

2025-09-22 21:02:32 +05:30
parent f98d49cea7
commit 12119d418b
38 changed files with 5742 additions and 2337 deletions
--- a/backend/services/blog_writer/content/medium_blog_generator.py
+++ b/backend/services/blog_writer/content/medium_blog_generator.py
@@ -0,0 +1,237 @@
+"""
+Medium Blog Generator Service
+
+Handles generation of medium-length blogs (≤1000 words) using structured AI calls.
+"""
+
+import time
+import json
+from typing import Dict, Any, List
+from loguru import logger
+
+from models.blog_models import (
+    MediumBlogGenerateRequest,
+    MediumBlogGenerateResult,
+    MediumGeneratedSection,
+    ResearchSource,
+)
+from services.llm_providers.gemini_provider import gemini_structured_json_response
+from services.cache.persistent_content_cache import persistent_content_cache
+
+
+class MediumBlogGenerator:
+    """Service for generating medium-length blog content using structured AI calls."""
+    
+    def __init__(self):
+        self.cache = persistent_content_cache
+    
+    async def generate_medium_blog_with_progress(self, req: MediumBlogGenerateRequest, task_id: str) -> MediumBlogGenerateResult:
+        """Use Gemini structured JSON to generate a medium-length blog in one call."""
+        import time
+        start = time.time()
+
+        # Prepare sections data for cache key generation
+        sections_for_cache = []
+        for s in req.sections:
+            sections_for_cache.append({
+                "id": s.id,
+                "heading": s.heading,
+                "keyPoints": getattr(s, "key_points", []) or getattr(s, "keyPoints", []),
+                "subheadings": getattr(s, "subheadings", []),
+                "keywords": getattr(s, "keywords", []),
+                "targetWords": getattr(s, "target_words", None) or getattr(s, "targetWords", None),
+            })
+
+        # Check cache first
+        cached_result = self.cache.get_cached_content(
+            keywords=req.researchKeywords or [],
+            sections=sections_for_cache,
+            global_target_words=req.globalTargetWords or 1000,
+            persona_data=req.persona.dict() if req.persona else None,
+            tone=req.tone,
+            audience=req.audience
+        )
+        
+        if cached_result:
+            logger.info(f"Using cached content for keywords: {req.researchKeywords} (saved expensive generation)")
+            # Add cache hit marker to distinguish from fresh generation
+            cached_result['generation_time_ms'] = 0  # Mark as cache hit
+            cached_result['cache_hit'] = True
+            return MediumBlogGenerateResult(**cached_result)
+
+        # Cache miss - proceed with AI generation
+        logger.info(f"Cache miss - generating new content for keywords: {req.researchKeywords}")
+
+        # Build schema expected from the model
+        schema = {
+            "type": "object",
+            "properties": {
+                "title": {"type": "string"},
+                "sections": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "id": {"type": "string"},
+                            "heading": {"type": "string"},
+                            "content": {"type": "string"},
+                            "wordCount": {"type": "number"},
+                            "sources": {
+                                "type": "array",
+                                "items": {
+                                    "type": "object",
+                                    "properties": {"title": {"type": "string"}, "url": {"type": "string"}},
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+        }
+
+        # Compose prompt
+        def section_block(s):
+            return {
+                "id": s.id,
+                "heading": s.heading,
+                "outline": {
+                    "keyPoints": getattr(s, "key_points", []) or getattr(s, "keyPoints", []),
+                    "subheadings": getattr(s, "subheadings", []),
+                    "keywords": getattr(s, "keywords", []),
+                    "targetWords": getattr(s, "target_words", None) or getattr(s, "targetWords", None),
+                    "references": [
+                        {"title": r.title, "url": r.url} for r in getattr(s, "references", [])
+                    ],
+                },
+            }
+
+        payload = {
+            "title": req.title,
+            "globalTargetWords": req.globalTargetWords or 1000,
+            "persona": req.persona.dict() if req.persona else None,
+            "tone": req.tone,
+            "audience": req.audience,
+            "sections": [section_block(s) for s in req.sections],
+        }
+
+        # Build persona-aware system prompt
+        persona_context = ""
+        if req.persona:
+            persona_context = f"""
+            PERSONA GUIDELINES:
+            - Industry: {req.persona.industry or 'General'}
+            - Tone: {req.persona.tone or 'Professional'}
+            - Audience: {req.persona.audience or 'General readers'}
+            - Persona ID: {req.persona.persona_id or 'Default'}
+            
+            Write content that reflects this persona's expertise and communication style.
+            Use industry-specific terminology and examples where appropriate.
+            Maintain consistent voice and authority throughout all sections.
+            """
+        
+        system = (
+            "You are a professional blog writer with deep expertise in your field. "
+            "Generate high-quality, persona-driven content for each section based on the provided outline. "
+            "Write engaging, informative content that follows the section's key points and target word count. "
+            "Ensure the content flows naturally and maintains consistent voice and authority. "
+            "Format content with proper paragraph breaks using double line breaks (\\n\\n) between paragraphs. "
+            "Structure content with clear paragraphs - aim for 2-4 sentences per paragraph. "
+            f"{persona_context}"
+            "Return ONLY valid JSON with no markdown formatting or explanations."
+        )
+
+        # Build persona-specific content instructions
+        persona_instructions = ""
+        if req.persona:
+            industry = req.persona.industry or 'General'
+            tone = req.persona.tone or 'Professional'
+            audience = req.persona.audience or 'General readers'
+            
+            persona_instructions = f"""
+            PERSONA-DRIVEN CONTENT REQUIREMENTS:
+            - Write as an expert in {industry} industry
+            - Use {tone} tone appropriate for {audience}
+            - Include industry-specific examples and terminology
+            - Demonstrate authority and expertise in the field
+            - Use language that resonates with {audience}
+            - Maintain consistent voice that reflects this persona's expertise
+            """
+        
+        prompt = (
+            f"Write blog content for the following sections. Each section should be {req.globalTargetWords or 1000} words total, distributed across all sections.\n\n"
+            f"Blog Title: {req.title}\n\n"
+            "For each section, write engaging content that:\n"
+            "- Follows the key points provided\n"
+            "- Uses the suggested keywords naturally\n"
+            "- Meets the target word count\n"
+            "- Maintains professional tone\n"
+            "- References the provided sources when relevant\n"
+            "- Breaks content into clear paragraphs (2-4 sentences each)\n"
+            "- Uses double line breaks (\\n\\n) between paragraphs for proper formatting\n"
+            "- Starts with an engaging opening paragraph\n"
+            "- Ends with a strong concluding paragraph\n"
+            f"{persona_instructions}\n"
+            "IMPORTANT: Format the 'content' field with proper paragraph breaks using \\n\\n between paragraphs.\n\n"
+            "Return a JSON object with 'title' and 'sections' array. Each section should have 'id', 'heading', 'content', and 'wordCount'.\n\n"
+            f"Sections to write:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
+        )
+
+        ai_resp = gemini_structured_json_response(
+            prompt=prompt,
+            schema=schema,
+            temperature=0.2,
+            max_tokens=8192,
+            system_prompt=system,
+        )
+
+        # Check for errors in AI response
+        if not ai_resp or ai_resp.get("error"):
+            error_msg = ai_resp.get("error", "Empty generation result from model") if ai_resp else "No response from model"
+            logger.error(f"AI generation failed: {error_msg}")
+            raise Exception(f"AI generation failed: {error_msg}")
+
+        # Normalize output
+        title = ai_resp.get("title") or req.title
+        out_sections = []
+        for s in ai_resp.get("sections", []) or []:
+            out_sections.append(
+                MediumGeneratedSection(
+                    id=str(s.get("id")),
+                    heading=s.get("heading") or "",
+                    content=s.get("content") or "",
+                    wordCount=int(s.get("wordCount") or 0),
+                    sources=[
+                        # map to ResearchSource shape if possible; keep minimal
+                        ResearchSource(title=src.get("title", ""), url=src.get("url", ""))
+                        for src in (s.get("sources") or [])
+                    ] or None,
+                )
+            )
+
+        duration_ms = int((time.time() - start) * 1000)
+        result = MediumBlogGenerateResult(
+            success=True,
+            title=title,
+            sections=out_sections,
+            model="gemini-2.5-flash",
+            generation_time_ms=duration_ms,
+            safety_flags=None,
+        )
+        
+        # Cache the result for future use
+        try:
+            self.cache.cache_content(
+                keywords=req.researchKeywords or [],
+                sections=sections_for_cache,
+                global_target_words=req.globalTargetWords or 1000,
+                persona_data=req.persona.dict() if req.persona else None,
+                tone=req.tone or "professional",
+                audience=req.audience or "general",
+                result=result.dict()
+            )
+            logger.info(f"Cached content result for keywords: {req.researchKeywords}")
+        except Exception as cache_error:
+            logger.warning(f"Failed to cache content result: {cache_error}")
+            # Don't fail the entire operation if caching fails
+        
+        return result