# moreminimore-marketing/backend/services/persona_replication_engine.py

"""
Persona Replication Engine
Implements the hardened persona replication system for high-fidelity content generation.
Based on quantitative analysis and structured constraints.
"""

import json
import re
from datetime import datetime, timezone
from typing import Dict, Any, List, Optional

from loguru import logger

from services.llm_providers.gemini_provider import gemini_structured_json_response
from services.persona_analysis_service import PersonaAnalysisService

class PersonaReplicationEngine:
    """
    High-fidelity persona replication engine that generates content
    indistinguishable from the original author's work.
    """

    def __init__(self):
        """Create the engine and the persona service it looks personas up with."""
        # The public methods fetch persona records through this service.
        self.persona_service = PersonaAnalysisService()
        logger.info("PersonaReplicationEngine initialized")

    def generate_content_with_persona(self,
                                    user_id: int,
                                    platform: str,
                                    content_request: str,
                                    content_type: str = "post") -> Dict[str, Any]:
        """
        Produce persona-styled content for a platform and score how well it fits.

        The flow is: look up the persona, build the system and content prompts,
        generate the content, then validate the result against the persona.

        Args:
            user_id: User ID for persona lookup
            platform: Target platform (twitter, linkedin, blog, etc.)
            content_request: What content to generate
            content_type: Type of content (post, article, thread, etc.)

        Returns:
            A dict with the generated content, fidelity/platform scores, the
            compliance checks and generation metadata. On any failure a dict
            with a single "error" key is returned instead of raising.
        """
        try:
            logger.info(f"Generating {content_type} for {platform} using persona replication")

            persona = self.persona_service.get_persona_for_platform(user_id, platform)
            if not persona:
                return {"error": "No persona found for user and platform"}

            # Prompts are built system-first, then handed to the generator together.
            generated = self._generate_constrained_content(
                self._build_hardened_system_prompt(persona, platform),
                self._build_content_prompt(content_request, content_type, platform, persona),
                platform,
                persona,
            )
            if "error" in generated:
                # Generation errors are passed through to the caller unchanged.
                return generated

            text = generated["content"]
            checks = self._validate_content_fidelity(text, persona, platform)

            fidelity = checks["fidelity_score"]
            platform_fit = checks["platform_score"]
            compliance = checks["compliance_check"]
            metadata = {
                "persona_id": persona["core_persona"]["id"],
                "platform": platform,
                "content_type": content_type,
                "generated_at": generated.get("generated_at"),
                "constraints_applied": checks["constraints_checked"]
            }

            return {
                "content": text,
                "persona_fidelity_score": fidelity,
                "platform_optimization_score": platform_fit,
                "persona_compliance": compliance,
                "generation_metadata": metadata
            }

        except Exception as exc:
            logger.error(f"Error in persona replication engine: {str(exc)}")
            return {"error": f"Content generation failed: {str(exc)}"}

    def _build_hardened_system_prompt(self, persona_data: Dict[str, Any], platform: str) -> str:
        """
        Build the system prompt that pins the model to the persona for one platform.

        Empty or missing lexical lists fall back to the same generic wording that
        ``create_hardened_persona_prompt`` uses, and the permissible-tones clause
        is omitted when no tones are listed, so the prompt never carries blank
        "USE:" / "PHRASES:" / "AVOID:" directives.

        Args:
            persona_data: Persona record; must contain a "core_persona" dict and
                may contain a "platform_adaptation" dict.
            platform: Target platform name, interpolated into the prompt.

        Returns:
            The complete system prompt text.

        Raises:
            KeyError: If persona_data has no "core_persona" entry.
        """

        def listed(values, limit, fallback):
            """Comma-join up to `limit` entries, or return `fallback` when there are none."""
            if isinstance(values, str):
                # A bare string would otherwise be joined character by character.
                values = [values]
            entries = [str(value) for value in list(values or [])[:limit]]
            return ', '.join(entries) if entries else fallback

        core_persona = persona_data["core_persona"]
        platform_adaptation = persona_data.get("platform_adaptation") or {}

        # Extract key persona elements (`or {}` also covers keys stored as None)
        identity = core_persona.get("linguistic_fingerprint") or {}
        sentence_metrics = identity.get("sentence_metrics") or {}
        lexical_features = identity.get("lexical_features") or {}
        rhetorical_devices = identity.get("rhetorical_devices") or {}
        tonal_range = core_persona.get("tonal_range") or {}

        # Platform-specific constraints
        platform_constraints = platform_adaptation.get("content_format_rules") or {}
        engagement_patterns = platform_adaptation.get("engagement_patterns") or {}

        # Only mention permissible tones when the persona actually lists some.
        tone_clause = f"{tonal_range.get('default_tone', 'professional')}."
        permissible = listed(tonal_range.get('permissible_tones'), None, '')
        if permissible:
            tone_clause += f" Permissible tones: {permissible}."

        use_words = listed(lexical_features.get('go_to_words'), 5, 'professional vocabulary')
        use_phrases = listed(lexical_features.get('go_to_phrases'), 3, 'natural transitions')
        avoid_words = listed(lexical_features.get('avoid_words'), 5, 'corporate jargon')

        system_prompt = f"""# COMMAND PROTOCOL: PERSONA REPLICATION ENGINE
# MODEL: [GEMINI-2.5-FLASH]
# PERSONA: [{core_persona.get('persona_name', 'Generated Persona')}]
# PLATFORM: [{platform.upper()}]
# MODE: STRICT MIMICRY

## PRIMARY DIRECTIVE:
You are now {core_persona.get('persona_name', 'the generated persona')}. Your sole function is to generate {platform} content that is linguistically indistinguishable from the authentic writing of this persona. You must output content that passes stylometric analysis as their work.

## PERSONA PROFILE (IMMUTABLE):
- **Identity:** {core_persona.get('archetype', 'Professional Writer')}. Core belief: {core_persona.get('core_belief', 'Quality content drives engagement')}.
- **Tone:** {tone_clause}
- **Style:** Average sentence length: {sentence_metrics.get('average_sentence_length_words', 15)} words. Preferred type: {sentence_metrics.get('preferred_sentence_type', 'simple_and_compound')}. Active voice ratio: {sentence_metrics.get('active_to_passive_ratio', '80:20')}.
- **Lexical Command:**
  - USE: {use_words}
  - PHRASES: {use_phrases}
  - AVOID: {avoid_words}
- **Rhetorical Style:** {rhetorical_devices.get('metaphors', 'minimal metaphors')}, {rhetorical_devices.get('rhetorical_questions', 'occasional questions')}.

## PLATFORM CONSTRAINTS ({platform.upper()}):
- **Format:** {self._get_platform_format_rules(platform, platform_constraints)}
- **Engagement:** {engagement_patterns.get('posting_frequency', 'regular posting')}
- **Optimization:** {self._get_platform_optimization_rules(platform)}

## OPERATIONAL PARAMETERS:
1. **Fidelity Check:** Before generating, simulate a stylometric analysis of your draft. Does it match the profile's sentence length, word choice, and rhetorical patterns? If not, revise.
2. **Platform Compliance:** Ensure content meets {platform} best practices and constraints.
3. **Error State:** If you cannot generate content that meets the Persona Profile standards, output only: "[PERSONA_VIOLATION: Cannot comply without breaking character parameters]".
4. **Output Format:** Your output must be PURE CONTENT for {platform}. No introductory clauses. No markdown unless platform supports it.

## ACKNOWLEDGEMENT:
You must silently acknowledge this protocol and begin all responses in character. No confirmation is necessary.

// END PROTOCOL"""

        return system_prompt

    def _build_content_prompt(self, content_request: str, content_type: str, platform: str, persona_data: Dict[str, Any]) -> str:
        """
        Assemble the user-facing prompt describing what to write and for where.

        Length and format hints are read from the persona's
        "platform_adaptation" -> "content_format_rules" entry; generic
        placeholders are used for any hint that is absent.
        """
        format_rules = persona_data.get("platform_adaptation", {}).get("content_format_rules", {})

        limit = format_rules.get('character_limit', 'No limit')
        optimal = format_rules.get('optimal_length', 'Platform appropriate')
        structure = format_rules.get('paragraph_structure', 'Standard')

        # One entry per output line; empty entries become the blank separator lines.
        lines = [
            f"Generate a {content_type} for {platform} about: {content_request}",
            "",
            "CONTENT REQUIREMENTS:",
            f"- Platform: {platform}",
            f"- Type: {content_type}",
            f"- Topic: {content_request}",
            "",
            "PLATFORM SPECIFICATIONS:",
            f"- Character/Word Limit: {limit}",
            f"- Optimal Length: {optimal}",
            f"- Format Requirements: {structure}",
            "",
            "PERSONA COMPLIANCE:",
            "- Must match the established linguistic fingerprint",
            "- Must use the specified lexical features",
            "- Must maintain the defined tonal range",
            "- Must follow platform-specific adaptations",
            "",
            f"Generate content that is indistinguishable from the original author's work while optimized for {platform} performance.",
        ]
        return "\n".join(lines)

    def _generate_constrained_content(self, system_prompt: str, content_prompt: str, platform: str, persona_data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Ask the LLM for schema-constrained content and timestamp the response.

        Args:
            system_prompt: Persona system prompt passed to the model.
            content_prompt: The content request passed as the user prompt.
            platform: Target platform (accepted for interface symmetry; not
                used by the current implementation).
            persona_data: Persona record (accepted for interface symmetry; not
                used by the current implementation).

        Returns:
            The model's structured response with an added "generated_at" key
            holding an ISO-8601 UTC timestamp, or a dict with a single "error"
            key when generation fails.
        """

        # Define content generation schema
        content_schema = {
            "type": "object",
            "properties": {
                "content": {"type": "string"},
                "persona_compliance_check": {
                    "type": "object",
                    "properties": {
                        "sentence_length_check": {"type": "boolean"},
                        "lexical_compliance": {"type": "boolean"},
                        "tonal_compliance": {"type": "boolean"},
                        "platform_optimization": {"type": "boolean"}
                    }
                },
                "platform_specific_elements": {
                    "type": "object",
                    "properties": {
                        "hashtags": {"type": "array", "items": {"type": "string"}},
                        "mentions": {"type": "array", "items": {"type": "string"}},
                        "call_to_action": {"type": "string"},
                        "engagement_hooks": {"type": "array", "items": {"type": "string"}}
                    }
                },
                "confidence_score": {"type": "number"}
            },
            "required": ["content", "persona_compliance_check", "confidence_score"]
        }

        try:
            response = gemini_structured_json_response(
                prompt=content_prompt,
                schema=content_schema,
                temperature=0.1,  # Very low temperature for consistent persona replication
                max_tokens=4096,
                system_prompt=system_prompt
            )

            if "error" in response:
                return {"error": f"Content generation failed: {response['error']}"}

            # Stamp the response with a real UTC timestamp; callers surface it
            # as generation metadata.
            response["generated_at"] = datetime.now(timezone.utc).isoformat()
            logger.info("Content generated with persona constraints")
            return response

        except Exception as e:
            logger.error(f"Error generating constrained content: {str(e)}")
            return {"error": f"Content generation error: {str(e)}"}

    def _validate_content_fidelity(self, content: str, persona_data: Dict[str, Any], platform: str) -> Dict[str, Any]:
        """
        Score generated content against the persona's measurable constraints.

        Three checks are run and recorded under "compliance_check":
          * sentence_length: average words per sentence is within 5 words of
            the persona's target (default target 15). Sentences end at any run
            of ".", "!" or "?".
          * lexical_features: at least one of the first three go-to words
            appears (trivially satisfied when the persona lists none) and no
            avoid-word appears. Matching is case-insensitive substring search.
          * platform_constraints: content length does not exceed the numeric
            character limit, if one is configured. A limit that cannot be read
            as an integer is treated as "no limit".

        Args:
            content: The generated text to evaluate.
            persona_data: Persona record; must contain "core_persona".
            platform: Target platform (not used by the current checks).

        Returns:
            Dict with "fidelity_score" (percentage of checks passed),
            "platform_score" (100 or 50), "compliance_check" and
            "constraints_checked". If validation itself fails, both scores are
            0.0 and "compliance_check" carries the error message.
        """

        try:
            # Basic validation metrics
            validation_result = {
                "fidelity_score": 0.0,
                "platform_score": 0.0,
                "compliance_check": {},
                "constraints_checked": []
            }

            core_persona = persona_data["core_persona"]
            platform_adaptation = persona_data.get("platform_adaptation", {})
            linguistic = core_persona.get("linguistic_fingerprint", {})

            # Check sentence length compliance. Splitting on "." alone would
            # glue questions and exclamations onto the following sentence.
            word_counts = [
                len(sentence.split())
                for sentence in re.split(r"[.!?]+", content)
                if sentence.strip()
            ]
            avg_length = sum(word_counts) / max(len(word_counts), 1)

            target_length = linguistic.get("sentence_metrics", {}).get("average_sentence_length_words", 15)
            length_compliance = abs(avg_length - target_length) <= 5  # Allow 5-word variance

            validation_result["compliance_check"]["sentence_length"] = length_compliance
            validation_result["constraints_checked"].append("sentence_length")

            # Check lexical compliance
            lexical_features = linguistic.get("lexical_features", {})
            go_to_words = lexical_features.get("go_to_words") or []
            avoid_words = lexical_features.get("avoid_words") or []

            content_lower = content.lower()
            # With no go-to words defined there is nothing the content could
            # have failed to use, so the requirement is met by default.
            uses_go_to_words = (not go_to_words) or any(
                str(word).lower() in content_lower for word in go_to_words[:3]
            )
            avoids_bad_words = not any(str(word).lower() in content_lower for word in avoid_words)

            lexical_compliance = uses_go_to_words and avoids_bad_words
            validation_result["compliance_check"]["lexical_features"] = lexical_compliance
            validation_result["constraints_checked"].append("lexical_features")

            # Check platform constraints
            platform_constraints = platform_adaptation.get("content_format_rules", {})
            raw_limit = platform_constraints.get("character_limit")
            try:
                char_limit = int(raw_limit) if raw_limit else None
            except (TypeError, ValueError):
                # Non-numeric limits (e.g. descriptive text) cannot be enforced.
                char_limit = None

            platform_compliance = not (char_limit and len(content) > char_limit)

            validation_result["compliance_check"]["platform_constraints"] = platform_compliance
            validation_result["constraints_checked"].append("platform_constraints")

            # Calculate overall scores
            compliance_checks = validation_result["compliance_check"]
            fidelity_score = sum(compliance_checks.values()) / len(compliance_checks) * 100
            platform_score = 100 if platform_compliance else 50  # Heavy penalty for platform violations

            validation_result["fidelity_score"] = fidelity_score
            validation_result["platform_score"] = platform_score

            logger.info(f"Content validation: Fidelity={fidelity_score}%, Platform={platform_score}%")
            return validation_result

        except Exception as e:
            logger.error(f"Error validating content fidelity: {str(e)}")
            return {
                "fidelity_score": 0.0,
                "platform_score": 0.0,
                "compliance_check": {"error": str(e)},
                "constraints_checked": []
            }

    def _get_platform_format_rules(self, platform: str, constraints: Dict[str, Any]) -> str:
        """
        Summarise the character limit and optimal length in one line.

        Missing entries are rendered as "No limit" and "Platform appropriate".
        The platform argument is not used.
        """
        return (
            f"Character limit: {constraints.get('character_limit', 'No limit')}, "
            f"Optimal length: {constraints.get('optimal_length', 'Platform appropriate')}"
        )

    def _get_platform_optimization_rules(self, platform: str) -> str:
        """
        Return the optimization guidance line for a platform.

        The lookup ignores case and surrounding whitespace, so "Twitter" and
        " LINKEDIN " resolve to the same rules as their lowercase names.

        Args:
            platform: Platform name.

        Returns:
            The platform's guidance text, or "Platform-appropriate optimization"
            for platforms without a dedicated entry.
        """

        rules = {
            "twitter": "Use hashtags strategically (max 3), engage with questions, optimize for retweets",
            "linkedin": "Professional tone, thought leadership focus, encourage professional discussion",
            "instagram": "Visual-first approach, emoji usage, story-friendly format",
            "facebook": "Community engagement, shareable content, algorithm-friendly",
            "blog": "SEO-optimized, scannable format, internal linking",
            "medium": "Storytelling focus, publication-ready, clap optimization",
            "substack": "Newsletter format, subscriber value, email-friendly"
        }

        # Normalise before lookup; a falsy platform falls through to the default.
        platform_key = (platform or "").strip().lower()
        return rules.get(platform_key, "Platform-appropriate optimization")

    def create_hardened_persona_prompt(self, persona_data: Dict[str, Any], platform: str) -> str:
        """
        Create the hardened persona prompt for direct use in AI interfaces.
        This is the fire-and-forget prompt that can be copied into any AI system.

        The Tone line lists permissible and forbidden tones only when the
        persona defines them, so no stray punctuation is left behind for the
        absent ones. Lexical lists fall back to generic wording when empty.

        Args:
            persona_data: Persona record; must contain a "core_persona" dict and
                may contain a "platform_adaptation" dict.
            platform: Target platform name, interpolated into the prompt.

        Returns:
            The prompt text, followed by usage instructions and a quality
            checklist.

        Raises:
            KeyError: If persona_data has no "core_persona" entry.
        """

        def listed(values, limit, fallback):
            """Comma-join up to `limit` entries, or return `fallback` when there are none."""
            if isinstance(values, str):
                # A bare string would otherwise be joined character by character.
                values = [values]
            entries = [str(value) for value in list(values or [])[:limit]]
            return ', '.join(entries) if entries else fallback

        core_persona = persona_data["core_persona"]
        platform_adaptation = persona_data.get("platform_adaptation") or {}

        # Extract quantitative data (`or {}` also covers keys stored as None)
        linguistic = core_persona.get("linguistic_fingerprint") or {}
        sentence_metrics = linguistic.get("sentence_metrics") or {}
        lexical_features = linguistic.get("lexical_features") or {}
        rhetorical_devices = linguistic.get("rhetorical_devices") or {}
        tonal_range = core_persona.get("tonal_range") or {}

        default_tone = tonal_range.get('default_tone', 'professional')
        avg_sentence_length = sentence_metrics.get('average_sentence_length_words', 15)

        # Build the tone sentence from the parts that exist, each ending in ".".
        tone_parts = [str(default_tone)]
        permissible = listed(tonal_range.get('permissible_tones'), None, '')
        if permissible:
            tone_parts.append(f"Permissible: {permissible}")
        forbidden = listed(tonal_range.get('forbidden_tones'), None, '')
        if forbidden:
            tone_parts.append(f"Forbidden: {forbidden}")
        tone_line = ". ".join(tone_parts) + "."

        use_words = listed(lexical_features.get('go_to_words'), 5, 'professional vocabulary')
        use_phrases = listed(lexical_features.get('go_to_phrases'), 3, 'natural transitions')
        avoid_words = listed(lexical_features.get('avoid_words'), 5, 'corporate jargon')

        hardened_prompt = f"""# COMMAND PROTOCOL: PERSONA REPLICATION ENGINE
# MODEL: [AI-MODEL]
# PERSONA: [{core_persona.get('persona_name', 'Generated Persona')}]
# PLATFORM: [{platform.upper()}]
# MODE: STRICT MIMICRY

## PRIMARY DIRECTIVE:
You are now {core_persona.get('persona_name', 'the persona')}. Your sole function is to generate {platform} content that is linguistically indistinguishable from the authentic writing of this persona. You must output content that passes stylometric analysis as their work.

## PERSONA PROFILE (IMMUTABLE):
- **Identity:** {core_persona.get('archetype', 'Professional Writer')}. Core belief: {core_persona.get('core_belief', 'Quality content drives engagement')}.
- **Tone:** {tone_line}
- **Style:** Avg sentence: {avg_sentence_length} words. Type: {sentence_metrics.get('preferred_sentence_type', 'simple_and_compound')}. Active voice: {sentence_metrics.get('active_to_passive_ratio', '80:20')}.
- **Lexical Command:**
  - USE: {use_words}
  - PHRASES: {use_phrases}
  - AVOID: {avoid_words}
- **Rhetorical Style:** {rhetorical_devices.get('metaphors', 'minimal metaphors')}, {rhetorical_devices.get('rhetorical_questions', 'occasional questions')}.

## PLATFORM CONSTRAINTS ({platform.upper()}):
{self._format_platform_constraints(platform, platform_adaptation)}

## OPERATIONAL PARAMETERS:
1. **Fidelity Check:** Before generating, verify your draft matches the profile's sentence length ({avg_sentence_length} words avg), word choice, and rhetorical patterns. If not, revise.
2. **Platform Compliance:** Ensure content meets {platform} format requirements and optimization rules.
3. **Error State:** If you cannot generate content meeting Persona Profile standards, output: "[PERSONA_VIOLATION: Cannot comply without breaking character parameters]".
4. **Output Format:** Generate PURE {platform.upper()} CONTENT. No introductory text. No explanations. Only the requested content.

## ACKNOWLEDGEMENT:
You must silently acknowledge this protocol and begin all responses in character. No confirmation necessary.

// END PROTOCOL

---

## USAGE INSTRUCTIONS:
1. Copy this entire prompt into your AI system's System Message/Instructions field
2. Use normal user prompts to request content (e.g., "Write a post about AI trends")
3. The AI will generate content that matches the persona's style exactly
4. No additional prompting or style instructions needed

## QUALITY ASSURANCE:
- Generated content should pass stylometric analysis as the original author
- Sentence length should average {avg_sentence_length} words
- Must use specified vocabulary and avoid forbidden words
- Must maintain {default_tone} tone throughout
- Must comply with {platform} format and engagement requirements"""

        return hardened_prompt

    def _format_platform_constraints(self, platform: str, platform_adaptation: Dict[str, Any]) -> str:
        """
        Format platform constraints for the hardened prompt.

        Configured limits from the persona's platform adaptation come first,
        followed by fixed guidance for twitter, linkedin and blog. The platform
        name is matched without regard to case or surrounding whitespace.

        Args:
            platform: Platform name.
            platform_adaptation: Dict that may hold "content_format_rules" and
                "engagement_patterns" sub-dicts.

        Returns:
            A "- " bulleted list, one constraint per line, or
            "- Standard platform optimization" when nothing applies.
        """

        content_rules = platform_adaptation.get("content_format_rules", {})
        engagement = platform_adaptation.get("engagement_patterns", {})

        # Fixed guidance for the platforms that have dedicated rules.
        platform_extras = {
            "twitter": [
                "Max 3 hashtags",
                "Thread-friendly format",
                "Engagement-optimized"
            ],
            "linkedin": [
                "Professional networking focus",
                "Thought leadership tone",
                "Business value emphasis"
            ],
            "blog": [
                "SEO-optimized structure",
                "Scannable format",
                "Clear headings"
            ]
        }

        constraints = []

        if content_rules.get("character_limit"):
            constraints.append(f"Character limit: {content_rules['character_limit']}")

        if content_rules.get("optimal_length"):
            constraints.append(f"Optimal length: {content_rules['optimal_length']}")

        if engagement.get("posting_frequency"):
            constraints.append(f"Frequency: {engagement['posting_frequency']}")

        # Normalise so "Twitter" or " BLOG " pick up the same guidance.
        platform_key = (platform or "").strip().lower()
        constraints.extend(platform_extras.get(platform_key, []))

        if not constraints:
            return "- Standard platform optimization"
        return "- " + "\n- ".join(constraints)

    def export_persona_for_external_use(self, user_id: int, platform: str) -> Dict[str, Any]:
        """
        Export a complete persona package for use in external AI systems.
        This creates a self-contained persona replication system.

        Args:
            user_id: User ID for persona lookup.
            platform: Platform whose persona adaptation should be exported.

        Returns:
            A dict with "persona_metadata" (including an ISO-8601 UTC
            "generated_at" timestamp), "hardened_system_prompt",
            "usage_examples", "validation_checklist" and "quick_reference".
            On any failure a dict with a single "error" key is returned
            instead of raising.
        """
        try:
            # Get persona data
            persona_data = self.persona_service.get_persona_for_platform(user_id, platform)

            if not persona_data:
                return {"error": "No persona found"}

            # Create hardened prompt
            hardened_prompt = self.create_hardened_persona_prompt(persona_data, platform)

            # Create usage examples
            examples = self._generate_usage_examples(persona_data, platform)

            # Create validation checklist
            validation_checklist = self._create_validation_checklist(persona_data, platform)

            core_persona = persona_data["core_persona"]
            linguistic = core_persona.get("linguistic_fingerprint", {})
            format_rules = persona_data.get("platform_adaptation", {}).get("content_format_rules", {})

            export_package = {
                "persona_metadata": {
                    "persona_id": core_persona["id"],
                    "persona_name": core_persona["persona_name"],
                    "platform": platform,
                    # Timezone-aware UTC timestamp of when the package was built.
                    "generated_at": datetime.now(timezone.utc).isoformat(),
                    "confidence_score": core_persona.get("confidence_score", 0.0)
                },
                "hardened_system_prompt": hardened_prompt,
                "usage_examples": examples,
                "validation_checklist": validation_checklist,
                "quick_reference": {
                    "avg_sentence_length": linguistic.get("sentence_metrics", {}).get("average_sentence_length_words", 15),
                    "go_to_words": linguistic.get("lexical_features", {}).get("go_to_words", [])[:5],
                    "default_tone": core_persona.get("tonal_range", {}).get("default_tone", "professional"),
                    "platform_limit": format_rules.get("character_limit", "No limit")
                }
            }

            logger.info(f"✅ Persona export package created for {platform}")
            return export_package

        except Exception as e:
            logger.error(f"Error exporting persona: {str(e)}")
            return {"error": f"Export failed: {str(e)}"}

    def _generate_usage_examples(self, persona_data: Dict[str, Any], platform: str) -> List[Dict[str, Any]]:
        """
        Return two sample requests with the style expectations to check them against.

        Only the platform name varies between calls; persona_data is not read.
        """
        ai_trends_example = dict(
            request=f"Write a {platform} post about AI trends",
            expected_style="Should match persona's sentence length and lexical features",
            validation_points=[
                "Check average sentence length",
                "Verify use of go-to words",
                "Confirm tonal compliance",
                f"Ensure {platform} optimization",
            ],
        )
        productivity_example = dict(
            request=f"Create {platform} content about productivity tips",
            expected_style="Should maintain consistent voice and rhetorical patterns",
            validation_points=[
                "Verify rhetorical device usage",
                "Check for forbidden words",
                "Confirm platform constraints",
                "Validate engagement elements",
            ],
        )
        return [ai_trends_example, productivity_example]

    def _create_validation_checklist(self, persona_data: Dict[str, Any], platform: str) -> List[str]:
        """
        Build the six-item checklist a reviewer can tick off for generated content.

        Values come from the persona's linguistic fingerprint and tonal range;
        the sentence length defaults to 15 and the tone to "professional".
        """
        core = persona_data["core_persona"]
        fingerprint = core.get("linguistic_fingerprint", {})

        target_length = fingerprint.get('sentence_metrics', {}).get('average_sentence_length_words', 15)
        lexical = fingerprint.get('lexical_features', {})
        # Only the first three entries of each word list are shown.
        favoured = ', '.join(lexical.get('go_to_words', [])[:3])
        banned = ', '.join(lexical.get('avoid_words', [])[:3])
        tone = core.get('tonal_range', {}).get('default_tone', 'professional')

        return [
            f"✓ Average sentence length ~{target_length} words",
            f"✓ Uses go-to words: {favoured}",
            f"✓ Avoids forbidden words: {banned}",
            f"✓ Maintains {tone} tone",
            f"✓ Follows {platform} format requirements",
            f"✓ Includes appropriate {platform} engagement elements",
        ]