Scheduled research persona generation

2025-11-05 08:51:00 +05:30
parent 55087c4f37
commit d99c7c83a7
98 changed files with 14518 additions and 828 deletions
--- a/backend/services/research/research_persona_prompt_builder.py
+++ b/backend/services/research/research_persona_prompt_builder.py
@@ -0,0 +1,171 @@
+"""
+Research Persona Prompt Builder
+
+Handles building comprehensive prompts for research persona generation.
+Generates personalized research defaults, suggestions, and configurations.
+"""
+
+from typing import Dict, Any
+import json
+from loguru import logger
+
+
+class ResearchPersonaPromptBuilder:
+    """Builds comprehensive prompts for research persona generation."""
+    
+    def build_research_persona_prompt(self, onboarding_data: Dict[str, Any]) -> str:
+        """Build the research persona generation prompt with comprehensive data."""
+        
+        # Extract data from onboarding_data
+        website_analysis = onboarding_data.get("website_analysis", {}) or {}
+        persona_data = onboarding_data.get("persona_data", {}) or {}
+        research_prefs = onboarding_data.get("research_preferences", {}) or {}
+        business_info = onboarding_data.get("business_info", {}) or {}
+        
+        # Extract core persona
+        core_persona = persona_data.get("core_persona", {}) or {}
+        
+        prompt = f"""
+COMPREHENSIVE RESEARCH PERSONA GENERATION TASK: Create a highly detailed, personalized research persona based on the user's business, writing style, and content strategy. This persona will provide intelligent defaults and suggestions for research inputs.
+
+=== USER CONTEXT ===
+
+BUSINESS INFORMATION:
+{json.dumps(business_info, indent=2)}
+
+WEBSITE ANALYSIS:
+{json.dumps(website_analysis, indent=2)}
+
+CORE PERSONA:
+{json.dumps(core_persona, indent=2)}
+
+RESEARCH PREFERENCES:
+{json.dumps(research_prefs, indent=2)}
+
+=== RESEARCH PERSONA GENERATION REQUIREMENTS ===
+
+Generate a comprehensive research persona in JSON format with the following structure:
+
+1. DEFAULT VALUES:
+   - "default_industry": Extract from core_persona.industry, business_info.industry, or website_analysis target_audience. Use "General" only if none available.
+   - "default_target_audience": Extract from core_persona.target_audience, website_analysis.target_audience, or business_info.target_audience. Be specific and descriptive.
+   - "default_research_mode": Suggest "basic", "comprehensive", or "targeted" based on research_preferences.research_depth and content_type preferences.
+   - "default_provider": Suggest "google" for news/trends, "exa" for academic/technical deep-dives, or "google" as default.
+
+2. KEYWORD INTELLIGENCE:
+   - "suggested_keywords": Generate 8-12 keywords relevant to the user's industry, interests (from core_persona), and content goals.
+   - "keyword_expansion_patterns": Create a dictionary mapping common keywords to expanded, industry-specific terms. Include 10-15 patterns like:
+     {{"AI": ["healthcare AI", "medical AI", "clinical AI", "diagnostic AI"], "tools": ["medical devices", "clinical tools"], ...}}
+     Focus on industry-specific terminology from the user's domain.
+
+3. DOMAIN EXPERTISE:
+   - "suggested_exa_domains": List 4-6 authoritative domains for the user's industry (e.g., Healthcare: ["pubmed.gov", "nejm.org", "thelancet.com"]).
+   - "suggested_exa_category": Suggest appropriate Exa category based on industry:
+     - Healthcare/Science: "research paper"
+     - Finance: "financial report"
+     - Technology/Business: "company" or "news"
+     - Default: null (empty string for all categories)
+
+4. RESEARCH ANGLES:
+   - "research_angles": Generate 5-8 alternative research angles/focuses based on:
+     - User's pain points and challenges (from core_persona)
+     - Industry trends and opportunities
+     - Content goals (from research_preferences)
+     - Audience interests (from core_persona.interests)
+   Examples: "Compare {{topic}} tools", "{{topic}} ROI analysis", "Latest {{topic}} trends", etc.
+
+5. QUERY ENHANCEMENT:
+   - "query_enhancement_rules": Create templates for improving vague user queries:
+     {{"vague_ai": "Research: AI applications in {{industry}} for {{audience}}", "vague_tools": "Compare top {{industry}} tools", ...}}
+     Include 5-8 enhancement patterns.
+
+6. RECOMMENDED PRESETS:
+   - "recommended_presets": Generate 3-5 personalized research preset templates. Each preset should include:
+     - name: Descriptive name (e.g., "{{Industry}} Trends", "{{Audience}} Insights")
+     - keywords: Research query string
+     - industry: User's industry
+     - target_audience: User's target audience
+     - research_mode: "basic", "comprehensive", or "targeted"
+     - config: Complete ResearchConfig object with appropriate settings
+     - description: Brief explanation of what this preset researches
+     Make presets relevant to the user's specific industry, audience, and content goals.
+
+7. RESEARCH PREFERENCES:
+   - "research_preferences": Extract and structure research preferences from onboarding:
+     - research_depth: From research_preferences.research_depth
+     - content_types: From research_preferences.content_types
+     - auto_research: From research_preferences.auto_research
+     - factual_content: From research_preferences.factual_content
+
+=== OUTPUT REQUIREMENTS ===
+
+Return a valid JSON object matching this exact structure:
+{{
+  "default_industry": "string",
+  "default_target_audience": "string",
+  "default_research_mode": "basic" | "comprehensive" | "targeted",
+  "default_provider": "google" | "exa",
+  "suggested_keywords": ["keyword1", "keyword2", ...],
+  "keyword_expansion_patterns": {{
+    "keyword": ["expansion1", "expansion2", ...]
+  }},
+  "suggested_exa_domains": ["domain1.com", "domain2.com", ...],
+  "suggested_exa_category": "string or null",
+  "research_angles": ["angle1", "angle2", ...],
+  "query_enhancement_rules": {{
+    "pattern": "template"
+  }},
+  "recommended_presets": [
+    {{
+      "name": "string",
+      "keywords": "string",
+      "industry": "string",
+      "target_audience": "string",
+      "research_mode": "basic" | "comprehensive" | "targeted",
+      "config": {{
+        "mode": "basic" | "comprehensive" | "targeted",
+        "provider": "google" | "exa",
+        "max_sources": 10 | 15 | 12,
+        "include_statistics": true | false,
+        "include_expert_quotes": true | false,
+        "include_competitors": true | false,
+        "include_trends": true | false,
+        "exa_category": "string or null",
+        "exa_include_domains": ["domain1.com", ...],
+        "exa_search_type": "auto" | "keyword" | "neural"
+      }},
+      "description": "string"
+    }}
+  ],
+  "research_preferences": {{
+    "research_depth": "string",
+    "content_types": ["type1", "type2", ...],
+    "auto_research": true | false,
+    "factual_content": true | false
+  }},
+  "version": "1.0",
+  "confidence_score": 85.0
+}}
+
+=== IMPORTANT INSTRUCTIONS ===
+
+1. Be highly specific and personalized - use actual data from the user's business, persona, and preferences.
+2. Avoid generic suggestions - every field should reflect the user's unique context.
+3. For industries not clearly identified, infer from website_analysis.content_characteristics or writing_style.
+4. Ensure all suggested keywords, domains, and angles are relevant to the user's industry and audience.
+5. Generate realistic, actionable presets that the user would actually want to use.
+6. Confidence score should reflect data richness (0-100): higher if rich onboarding data, lower if minimal data.
+7. Return ONLY valid JSON - no markdown formatting, no explanatory text.
+
+Generate the research persona now:
+"""
+        
+        return prompt
+    
+    def get_json_schema(self) -> Dict[str, Any]:
+        """Return JSON schema for structured LLM response."""
+        # This will be used with llm_text_gen(json_struct=...)
+        from models.research_persona_models import ResearchPersona, ResearchPreset
+        
+        # Convert Pydantic model to JSON schema
+        return ResearchPersona.schema()