ALwrity/backend/services/blog_writer/content/enhanced_content_generator.py

"""
EnhancedContentGenerator - thin orchestrator for section generation.

Provider parity:
- Uses main_text_generation.llm_text_gen to respect GPT_PROVIDER (Gemini/HF)
- No direct provider coupling here; Google grounding remains in research only
"""

from typing import Any, Dict, List

from services.llm_providers.main_text_generation import llm_text_gen
from .source_url_manager import SourceURLManager
from .context_memory import ContextMemory
from .transition_generator import TransitionGenerator
from .flow_analyzer import FlowAnalyzer


class EnhancedContentGenerator:
    def __init__(self):
        self.url_manager = SourceURLManager()
        self.memory = ContextMemory(max_entries=12)
        self.transitioner = TransitionGenerator()
        self.flow = FlowAnalyzer()

    async def generate_section(self, section: Any, research: Any = None, mode: str = "polished", user_id: str = None, competitive_advantage: str = "") -> Dict[str, Any]:
        prev_summary = self.memory.build_previous_sections_summary(limit=2)
        research_context, section_sources = self._build_research_context(section)
        urls = self.url_manager.pick_relevant_urls(section, research) if not research_context else []
        global_research_context = self._build_global_research_context(research, competitive_advantage)
        prompt = self._build_prompt(section, prev_summary, research_context, urls, global_research_context)
        content_text: str = ""
        try:
            ai_resp = llm_text_gen(
                prompt=prompt,
                json_struct=None,
                system_prompt=None,
                user_id=user_id
            )
            if isinstance(ai_resp, dict) and ai_resp.get("text"):
                content_text = ai_resp.get("text", "")
            elif isinstance(ai_resp, str):
                content_text = ai_resp
            else:
                content_text = str(ai_resp or "")
        except Exception as e:
            content_text = ""

        result = {
            "content": content_text,
            "sources": section_sources,
        }
        previous_text = prev_summary
        current_text = result.get("content", "")
        transition = self.transitioner.generate_transition(previous_text, getattr(section, 'heading', 'This section'), use_llm=True)
        metrics = self.flow.assess_flow(previous_text, current_text, use_llm=True)
        if current_text:
            self.memory.update_with_section(getattr(section, 'id', 'unknown'), current_text, use_llm=True)
        result["transition"] = transition
        result["continuity_metrics"] = metrics
        try:
            sid = getattr(section, 'id', 'unknown')
            if not hasattr(self, "_last_continuity"):
                self._last_continuity = {}
            self._last_continuity[sid] = metrics
        except Exception:
            pass
        return result

    def _build_research_context(self, section: Any) -> tuple:
        """Build a rich research context block from the section's mapped sources.

        Returns (context_string, sources_list) where context_string is the
        formatted research context for the prompt, and sources_list contains
        {title, url} dicts for downstream use.

        When section.references is empty, returns ("", []) — the caller should
        handle this as a research gap and avoid generating unsupported claims.
        """
        references = getattr(section, 'references', []) or []
        if not references:
            return ("", [])

        context_parts = []
        sources_out = []
        for i, ref in enumerate(references, 1):
            if isinstance(ref, dict):
                title = ref.get('title', '')
                excerpt = ref.get('excerpt', '')
                highlights = ref.get('highlights', []) or []
                summary = ref.get('summary', '')
                url = ref.get('url', '')
                content = ref.get('content', '') or ''
                author = ref.get('author', '') or ''
                source_type = ref.get('source_type', '') or ''
                credibility_score = ref.get('credibility_score')
                published_at = ref.get('published_at', '') or ''
            else:
                title = getattr(ref, 'title', '')
                excerpt = getattr(ref, 'excerpt', '')
                highlights = getattr(ref, 'highlights', []) or []
                summary = getattr(ref, 'summary', '')
                url = getattr(ref, 'url', '')
                content = getattr(ref, 'content', '') or ''
                author = getattr(ref, 'author', '') or ''
                source_type = getattr(ref, 'source_type', '') or ''
                credibility_score = getattr(ref, 'credibility_score', None)
                published_at = getattr(ref, 'published_at', '') or ''

            sources_out.append({"title": title, "url": url})

            attribution_parts = []
            if author:
                attribution_parts.append(f"by {author}")
            if source_type:
                attribution_parts.append(f"[{source_type}]")
            attribution = " ".join(attribution_parts)
            credibility_tag = ""
            if credibility_score is not None:
                try:
                    score = float(credibility_score)
                    if score >= 0.9:
                        credibility_tag = " (high-credibility)"
                    elif score >= 0.75:
                        credibility_tag = " (credible)"
                except (ValueError, TypeError):
                    pass
            recency_tag = ""
            if published_at:
                recency_tag = f" (published {published_at[:10]})" if len(published_at) >= 10 else f" (published {published_at})"

            header = f"Source {i}: {title}"
            if attribution:
                header += f" {attribution}"
            header += f"{credibility_tag}{recency_tag}"
            part = header + "\n"
            if summary:
                part += f"  Summary: {summary[:1000]}\n"
            if excerpt:
                part += f"  Key excerpt: {excerpt[:1000]}\n"
            if content and not summary and not excerpt:
                part += f"  Content: {content[:800]}\n"
            if highlights:
                part += "  Key findings:\n"
                for h in highlights[:3]:
                    h_text = h[:500] if h else ''
                    if h_text:
                        part += f"  - {h_text}\n"

            context_parts.append(part)

        return ("\n".join(context_parts), sources_out)

    def _build_global_research_context(self, research: Any, competitive_advantage: str = "") -> str:
        """Build global research context from the full BlogResearchResponse object.

        Extracts keyword_analysis, competitor_analysis, search_queries,
        and competitive_advantage into a compact context block that provides
        the LLM with strategic direction beyond per-section sources.
        """
        if research is None:
            return ""
        parts = []

        ka = getattr(research, 'keyword_analysis', None) or {}
        if ka:
            primary = ka.get('primary', [])
            secondary = ka.get('secondary', [])
            search_intent = ka.get('search_intent', '')
            kw_lines = []
            if primary:
                kw_lines.append(f"Primary keywords: {', '.join(primary[:10])}")
            if secondary:
                kw_lines.append(f"Secondary keywords: {', '.join(secondary[:10])}")
            if search_intent:
                kw_lines.append(f"Search intent: {search_intent}")
            if kw_lines:
                parts.append("=== KEYWORD & SEARCH STRATEGY ===\n" + "\n".join(kw_lines))

        ca = getattr(research, 'competitor_analysis', None) or {}
        if ca:
            ca_lines = []
            content_gaps = ca.get('content_gaps', [])
            if content_gaps:
                ca_lines.append(f"Content gaps (address these): {', '.join(content_gaps[:5])}")
            industry_leaders = ca.get('industry_leaders', [])
            if industry_leaders:
                ca_lines.append(f"Industry leaders: {', '.join(industry_leaders[:5])}")
            opportunities = ca.get('opportunities', [])
            if opportunities:
                ca_lines.append(f"Opportunities: {', '.join(opportunities[:5])}")
            if ca_lines:
                parts.append("=== COMPETITIVE LANDSCAPE ===\n" + "\n".join(ca_lines))

        sq = getattr(research, 'search_queries', None) or []
        if sq:
            parts.append(f"=== SEARCH INTENT SIGNALS ===\nOriginal search queries: {', '.join(sq[:8])}")

        if competitive_advantage:
            parts.append(f"=== COMPETITIVE ADVANTAGE ===\nEmphasize this differentiator: {competitive_advantage}")

        return "\n\n".join(parts) if parts else ""

    def _build_prompt(self, section: Any, prev_summary: str, research_context: str, urls: list, global_research_context: str = "") -> str:
        heading = getattr(section, 'heading', 'Section')
        key_points = getattr(section, 'key_points', [])
        keywords = getattr(section, 'keywords', [])
        subheadings = getattr(section, 'subheadings', []) or []
        target_words = getattr(section, 'target_words', 300)

        prompt = (
            f"You are writing the blog section '{heading}'.\n\n"
            f"Context summary (previous sections): {prev_summary}\n\n"
            f"Authoring requirements:\n"
            f"- Target word count: ~{target_words}\n"
            f"- Use the following key points: {', '.join(key_points)}\n"
            f"- Include these keywords naturally: {', '.join(keywords)}\n"
        )

        if subheadings:
            prompt += f"- Cover these subtopics: {', '.join(subheadings)}\n"

        if global_research_context:
            prompt += f"\n{global_research_context}\n\n"

        if research_context:
            prompt += (
                f"\nResearch sources for this section (use these facts, statistics, "
                f"and insights to support your writing):\n{research_context}\n\n"
                "IMPORTANT: Base your writing on the research sources above. "
                "Use specific facts, statistics, and data from these sources. "
                "Do not invent numbers, statistics, or claims not supported by the research.\n"
            )
        elif urls:
            import logging
            logging.getLogger('content_generator').warning(
                f"No research context for section '{heading}' — falling back to bare URLs"
            )
            url_lines = []
            for u in urls:
                if isinstance(u, dict):
                    url_lines.append(f"- {u.get('title','')} ({u.get('url','')})")
                else:
                    url_lines.append(f"- {u}")
            prompt += f"\nReference URLs (consult for additional context):\n" + "\n".join(url_lines) + "\n"

        prompt += (
            "\nWrite engaging, well-structured markdown with clear paragraphs "
            "(2-4 sentences each) separated by double line breaks."
        )

        return prompt