# ALwrity/backend/services/blog_writer/content/medium_blog_generator.py

"""
Medium Blog Generator Service

Handles generation of medium-length blogs (≤1000 words) using structured AI calls.
"""

import time
import json
from typing import Dict, Any, List
from loguru import logger
from fastapi import HTTPException
from sqlalchemy.orm import Session

from models.blog_models import (
    MediumBlogGenerateRequest,
    MediumBlogGenerateResult,
    MediumGeneratedSection,
    ResearchSource,
)
from services.llm_providers.main_text_generation import llm_text_gen
from services.cache.persistent_content_cache import persistent_content_cache


class MediumBlogGenerator:
    """Service for generating medium-length blog content using structured AI calls.

    The whole blog is produced by a single structured-JSON LLM call. Results
    are looked up in, and stored to, the persistent content cache, and are
    optionally saved to the user's workspace when a database session is given.
    """

    # Word budget used when the request does not set ``globalTargetWords``.
    DEFAULT_TARGET_WORDS = 1000
    # Fallbacks used for the cache key when the request omits tone/audience.
    DEFAULT_TONE = "professional"
    DEFAULT_AUDIENCE = "general"
    # Model name recorded on the result object.
    MODEL_NAME = "gemini-2.5-flash"

    def __init__(self):
        self.cache = persistent_content_cache

    # ------------------------------------------------------------------
    # Request -> prompt helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _outline_fields(section) -> Dict[str, Any]:
        """Return a section's outline fields, accepting snake_case or camelCase attributes."""
        return {
            "keyPoints": getattr(section, "key_points", []) or getattr(section, "keyPoints", []),
            "subheadings": getattr(section, "subheadings", []),
            "keywords": getattr(section, "keywords", []),
            "targetWords": getattr(section, "target_words", None) or getattr(section, "targetWords", None),
        }

    @classmethod
    def _section_block(cls, section) -> Dict[str, Any]:
        """Return the per-section structure embedded in the prompt payload."""
        outline = cls._outline_fields(section)
        # ``references`` may be absent or None; treat both as "no references".
        outline["references"] = [
            {"title": ref.title, "url": ref.url}
            for ref in (getattr(section, "references", None) or [])
        ]
        return {"id": section.id, "heading": section.heading, "outline": outline}

    @classmethod
    def _cache_key_args(cls, req, sections_for_cache: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Return the keyword arguments that identify this request in the cache.

        The same normalised values are used for lookup and for storage so that
        a request without tone/audience can hit the entry it previously stored.
        """
        return {
            "keywords": req.researchKeywords or [],
            "sections": sections_for_cache,
            "global_target_words": req.globalTargetWords or cls.DEFAULT_TARGET_WORDS,
            "persona_data": req.persona.dict() if req.persona else None,
            "tone": req.tone or cls.DEFAULT_TONE,
            "audience": req.audience or cls.DEFAULT_AUDIENCE,
        }

    @staticmethod
    def _response_schema() -> Dict[str, Any]:
        """Return the JSON schema the model is asked to follow."""
        source_schema = {
            "type": "object",
            "properties": {"title": {"type": "string"}, "url": {"type": "string"}},
        }
        section_schema = {
            "type": "object",
            "properties": {
                "id": {"type": "string"},
                "heading": {"type": "string"},
                "content": {"type": "string"},
                "wordCount": {"type": "number"},
                "sources": {"type": "array", "items": source_schema},
            },
        }
        return {
            "type": "object",
            "properties": {
                "title": {"type": "string"},
                "sections": {"type": "array", "items": section_schema},
            },
        }

    @staticmethod
    def _build_system_prompt(req) -> str:
        """Return the system prompt, including persona guidelines when a persona is set."""
        persona_context = ""
        if req.persona:
            persona_context = f"""
            PERSONA GUIDELINES:
            - Industry: {req.persona.industry or 'General'}
            - Tone: {req.persona.tone or 'Professional'}
            - Audience: {req.persona.audience or 'General readers'}
            - Persona ID: {req.persona.persona_id or 'Default'}

            Write content that reflects this persona's expertise and communication style.
            Use industry-specific terminology and examples where appropriate.
            Maintain consistent voice and authority throughout all sections.
            """

        return (
            "You are a professional blog writer with deep expertise in your field. "
            "Generate high-quality, persona-driven content for each section based on the provided outline. "
            "Write engaging, informative content that follows the section's key points and target word count. "
            "Ensure the content flows naturally and maintains consistent voice and authority. "
            "Format content with proper paragraph breaks using double line breaks (\\n\\n) between paragraphs. "
            "Structure content with clear paragraphs - aim for 2-4 sentences per paragraph. "
            f"{persona_context}"
            "Return ONLY valid JSON with no markdown formatting or explanations."
        )

    @classmethod
    def _build_prompt(cls, req, payload: Dict[str, Any]) -> str:
        """Return the user prompt describing the sections to write."""
        persona_instructions = ""
        if req.persona:
            industry = req.persona.industry or 'General'
            tone = req.persona.tone or 'Professional'
            audience = req.persona.audience or 'General readers'

            persona_instructions = f"""
            PERSONA-DRIVEN CONTENT REQUIREMENTS:
            - Write as an expert in {industry} industry
            - Use {tone} tone appropriate for {audience}
            - Include industry-specific examples and terminology
            - Demonstrate authority and expertise in the field
            - Use language that resonates with {audience}
            - Maintain consistent voice that reflects this persona's expertise
            """

        total_words = req.globalTargetWords or cls.DEFAULT_TARGET_WORDS
        return (
            # The budget applies to the whole blog, not to each section.
            f"Write blog content for the following sections. The complete blog should be about {total_words} words in total, distributed across all sections.\n\n"
            f"Blog Title: {req.title}\n\n"
            "For each section, write engaging content that:\n"
            "- Follows the key points provided\n"
            "- Uses the suggested keywords naturally\n"
            "- Meets the target word count\n"
            "- Maintains professional tone\n"
            "- References the provided sources when relevant\n"
            "- Breaks content into clear paragraphs (2-4 sentences each)\n"
            "- Uses double line breaks (\\n\\n) between paragraphs for proper formatting\n"
            "- Starts with an engaging opening paragraph\n"
            "- Ends with a strong concluding paragraph\n"
            f"{persona_instructions}\n"
            "IMPORTANT: Format the 'content' field with proper paragraph breaks using \\n\\n between paragraphs.\n\n"
            "Return a JSON object with 'title' and 'sections' array. Each section should have 'id', 'heading', 'content', and 'wordCount'.\n\n"
            f"Sections to write:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
        )

    # ------------------------------------------------------------------
    # Response helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _coerce_word_count(raw_count: Any, content: str) -> int:
        """Return the model-reported word count, or count words in ``content`` if it is unusable."""
        try:
            count = int(raw_count or 0)
        except (TypeError, ValueError, OverflowError):
            count = 0
        if count <= 0:
            count = len((content or "").split())
        return count

    @staticmethod
    def _render_markdown(title: str, sections) -> str:
        """Return the blog as markdown: an H1 title followed by one H2 block per section."""
        parts = [f"# {title}\n\n"]
        parts.extend(f"## {section.heading}\n\n{section.content}\n\n" for section in sections)
        return "".join(parts)

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    async def generate_medium_blog_with_progress(self, req: MediumBlogGenerateRequest, task_id: str, user_id: str, db: Session = None) -> MediumBlogGenerateResult:
        """Generate a medium-length blog in one structured-JSON LLM call.

        Args:
            req: Medium blog generation request.
            task_id: Task ID for progress updates (not used inside this method).
            user_id: User ID, forwarded to the LLM call; must be non-empty.
            db: Optional database session. When provided, the generated blog
                is also saved to the user's workspace (best effort).

        Returns:
            The generated blog, or the cached result when an entry matching the
            request already exists (in which case ``generation_time_ms`` is 0).

        Raises:
            ValueError: If user_id is not provided.
            HTTPException: Propagated unchanged from the LLM call.
            Exception: If the LLM call fails or returns an unusable response.
        """
        if not user_id:
            raise ValueError("user_id is required for medium blog generation (subscription checks and usage tracking)")

        start = time.perf_counter()

        # Section data that participates in the cache key.
        sections_for_cache = [
            {"id": s.id, "heading": s.heading, **self._outline_fields(s)}
            for s in req.sections
        ]
        cache_args = self._cache_key_args(req, sections_for_cache)

        # Check cache first
        cached_result = self.cache.get_cached_content(**cache_args)
        if cached_result:
            logger.info(f"Using cached content for keywords: {req.researchKeywords} (saved expensive generation)")
            # Work on a copy so the markers below never leak into the cache's own object.
            cached_payload = dict(cached_result)
            cached_payload['generation_time_ms'] = 0  # Mark as cache hit
            cached_payload['cache_hit'] = True
            return MediumBlogGenerateResult(**cached_payload)

        # Cache miss - proceed with AI generation
        logger.info(f"Cache miss - generating new content for keywords: {req.researchKeywords}")

        payload = {
            "title": req.title,
            "globalTargetWords": req.globalTargetWords or self.DEFAULT_TARGET_WORDS,
            "persona": req.persona.dict() if req.persona else None,
            "tone": req.tone,
            "audience": req.audience,
            "sections": [self._section_block(s) for s in req.sections],
        }

        # NOTE(review): llm_text_gen is invoked without ``await``; if it performs
        # blocking I/O this coroutine holds the event loop for the whole call.
        try:
            ai_resp = llm_text_gen(
                prompt=self._build_prompt(req, payload),
                json_struct=self._response_schema(),
                system_prompt=self._build_system_prompt(req),
                user_id=user_id
            )
        except HTTPException:
            # Re-raise HTTPExceptions (e.g., 429 subscription limit) to preserve error details
            raise
        except Exception as llm_error:
            # Wrap other errors, keeping the original traceback as the cause
            logger.error(f"AI generation failed: {llm_error}")
            raise Exception(f"AI generation failed: {str(llm_error)}") from llm_error

        # A provider may hand back the JSON as text; decode it before inspecting.
        if isinstance(ai_resp, str) and ai_resp.strip():
            try:
                ai_resp = json.loads(ai_resp)
            except ValueError as parse_error:
                logger.error(f"AI generation failed: model returned non-JSON text: {parse_error}")
                raise Exception("AI generation failed: model returned non-JSON text") from parse_error

        # Check for errors in AI response
        if not ai_resp:
            error_msg = "No response from model"
        elif not isinstance(ai_resp, dict):
            error_msg = f"Unexpected response type from model: {type(ai_resp).__name__}"
        else:
            error_msg = ai_resp.get("error")
        if error_msg:
            logger.error(f"AI generation failed: {error_msg}")
            raise Exception(f"AI generation failed: {error_msg}")

        # Normalize output
        title = ai_resp.get("title") or req.title
        out_sections = []
        for raw_section in ai_resp.get("sections") or []:
            if not isinstance(raw_section, dict):
                logger.warning(f"Skipping malformed section in model response: {raw_section!r}")
                continue
            content = raw_section.get("content") or ""
            # map to ResearchSource shape if possible; keep minimal
            sources = [
                ResearchSource(title=src.get("title") or "", url=src.get("url") or "")
                for src in (raw_section.get("sources") or [])
                if isinstance(src, dict)
            ]
            out_sections.append(
                MediumGeneratedSection(
                    id=str(raw_section.get("id")),
                    heading=raw_section.get("heading") or "",
                    content=content,
                    wordCount=self._coerce_word_count(raw_section.get("wordCount"), content),
                    sources=sources or None,
                )
            )

        duration_ms = int((time.perf_counter() - start) * 1000)
        result = MediumBlogGenerateResult(
            success=True,
            title=title,
            sections=out_sections,
            model=self.MODEL_NAME,
            generation_time_ms=duration_ms,
            safety_flags=None,
        )

        # Cache the result for future use; an empty result is not worth replaying.
        if out_sections:
            try:
                self.cache.cache_content(**cache_args, result=result.dict())
                logger.info(f"Cached content result for keywords: {req.researchKeywords}")
            except Exception as cache_error:
                # Don't fail the entire operation if caching fails
                logger.warning(f"Failed to cache content result: {cache_error}")
        else:
            logger.warning("Model returned no sections; skipping cache store for this result")

        # Save content to user workspace if db session is available
        if db is not None:
            try:
                # NOTE(review): save_and_track_text_content is not imported in the
                # visible part of this file; unless it is provided elsewhere this
                # call raises NameError, which the handler below only logs.
                save_and_track_text_content(
                    db=db,
                    user_id=user_id,
                    content=self._render_markdown(result.title, result.sections),
                    source_module="medium_blog_writer",
                    title=result.title,
                    description=f"Generated medium blog: {result.title}",
                    tags=req.researchKeywords or ["medium_blog", "ai_generated"],
                    asset_metadata={
                        "model": result.model,
                        "generation_time_ms": result.generation_time_ms,
                        "word_count": sum(s.wordCount for s in result.sections)
                    },
                    subdirectory="medium_blogs"
                )
                logger.info(f"Saved medium blog content to user workspace for user {user_id}")
            except Exception as e:
                # Best effort: a failed workspace save must not discard the generated blog.
                logger.error(f"Failed to save medium blog content to workspace: {e}")
        else:
            logger.warning("Database session not provided, skipping workspace save for medium blog")

        return result