""" Intent Prompt Builder Builds comprehensive AI prompts for: 1. Intent inference from user input 2. Targeted query generation 3. Intent-aware result analysis Author: ALwrity Team Version: 1.0 """ import json from datetime import datetime from typing import Dict, Any, List, Optional from loguru import logger from models.research_intent_models import ( ResearchIntent, ResearchPurpose, ContentOutput, ExpectedDeliverable, ResearchDepthLevel, ) from models.research_persona_models import ResearchPersona class IntentPromptBuilder: """Builds prompts for intent-driven research.""" def _get_current_date_context(self) -> str: """Get current date/time context for prompts.""" now = datetime.now() current_year = now.year current_month = now.strftime("%B") # Full month name current_date = now.strftime("%Y-%m-%d") return f"CURRENT DATE: {current_date} ({current_month} {current_year})\nCURRENT YEAR: {current_year}" # Purpose explanations for the AI PURPOSE_EXPLANATIONS = { ResearchPurpose.LEARN: "User wants to understand a topic for personal knowledge", ResearchPurpose.CREATE_CONTENT: "User will create content (blog, video, podcast) from this research", ResearchPurpose.MAKE_DECISION: "User needs to make a choice/decision based on research", ResearchPurpose.COMPARE: "User wants to compare alternatives or competitors", ResearchPurpose.SOLVE_PROBLEM: "User is looking for a solution to a specific problem", ResearchPurpose.FIND_DATA: "User needs specific statistics, facts, or citations", ResearchPurpose.EXPLORE_TRENDS: "User wants to understand current/future trends", ResearchPurpose.VALIDATE: "User wants to verify or fact-check information", ResearchPurpose.GENERATE_IDEAS: "User wants to brainstorm content ideas", } # Deliverable descriptions DELIVERABLE_DESCRIPTIONS = { ExpectedDeliverable.KEY_STATISTICS: "Numbers, percentages, data points with citations", ExpectedDeliverable.EXPERT_QUOTES: "Authoritative quotes from industry experts", ExpectedDeliverable.CASE_STUDIES: "Real examples and success stories", ExpectedDeliverable.COMPARISONS: "Side-by-side analysis tables", ExpectedDeliverable.TRENDS: "Current and emerging industry trends", ExpectedDeliverable.BEST_PRACTICES: "Recommended approaches and guidelines", ExpectedDeliverable.STEP_BY_STEP: "Process guides and how-to instructions", ExpectedDeliverable.PROS_CONS: "Advantages and disadvantages analysis", ExpectedDeliverable.DEFINITIONS: "Clear explanations of concepts and terms", ExpectedDeliverable.CITATIONS: "Authoritative sources for reference", ExpectedDeliverable.EXAMPLES: "Concrete examples to illustrate points", ExpectedDeliverable.PREDICTIONS: "Future outlook and predictions", } def build_intent_inference_prompt( self, user_input: str, keywords: List[str], research_persona: Optional[ResearchPersona] = None, competitor_data: Optional[List[Dict]] = None, industry: Optional[str] = None, target_audience: Optional[str] = None, ) -> str: """ Build prompt for inferring user's research intent. This prompt analyzes the user's input and determines: - What they want to accomplish - What questions they need answered - What specific deliverables they need """ # Get current date context date_context = self._get_current_date_context() now = datetime.now() current_year = now.year # Build persona context persona_context = self._build_persona_context(research_persona, industry, target_audience) # Build competitor context competitor_context = self._build_competitor_context(competitor_data) prompt = f"""You are an expert research intent analyzer. Your job is to understand what a content creator REALLY needs from their research. ## CURRENT DATE/TIME CONTEXT {date_context} **NOTE**: When user mentions time-sensitive terms (latest, current, recent, trends, predictions), prioritize {current_year} data. ## USER INPUT "{user_input}" {f"KEYWORDS: {', '.join(keywords)}" if keywords else ""} ## USER CONTEXT {persona_context} {competitor_context} ## YOUR TASK Analyze the user's input and infer their research intent. Determine: 1. **INPUT TYPE**: Is this: - "keywords": Simple topic keywords (e.g., "AI healthcare {current_year}") - "question": A specific question (e.g., "What are the best AI tools for healthcare?") - "goal": A goal statement (e.g., "I need to write a blog about AI in healthcare") - "mixed": Combination of above 2. **PRIMARY QUESTION**: What is the main question to answer? Convert their input into a clear question. 3. **SECONDARY QUESTIONS**: What related questions should also be answered? (3-5 questions) 4. **PURPOSE**: Why are they researching? Choose ONE: - "learn": Understand a topic for personal knowledge - "create_content": Create content (blog, video, podcast) - "make_decision": Make a choice between options - "compare": Compare alternatives/competitors - "solve_problem": Find a solution - "find_data": Get specific statistics/facts - "explore_trends": Understand industry trends - "validate": Verify claims/information - "generate_ideas": Brainstorm ideas 5. **CONTENT OUTPUT**: What will they create? Choose ONE: - "blog", "podcast", "video", "social_post", "newsletter", "presentation", "report", "whitepaper", "email", "general" 6. **EXPECTED DELIVERABLES**: What specific outputs do they need? Choose ALL that apply: - "key_statistics": Numbers, data points - "expert_quotes": Authoritative quotes - "case_studies": Real examples - "comparisons": Side-by-side analysis - "trends": Industry trends - "best_practices": Recommendations - "step_by_step": How-to guides - "pros_cons": Advantages/disadvantages - "definitions": Concept explanations - "citations": Source references - "examples": Concrete examples - "predictions": Future outlook 7. **DEPTH**: How deep should the research go? - "overview": Quick summary - "detailed": In-depth analysis - "expert": Comprehensive expert-level 8. **FOCUS AREAS**: What specific aspects should be researched? (2-4 areas) 9. **PERSPECTIVE**: From whose viewpoint? (e.g., "marketing manager", "small business owner") 10. **TIME SENSITIVITY**: Is recency important? - "real_time": Latest only (past 24-48 hours) - "recent": Past week/month - "historical": Include older content - "evergreen": Timeless content 11. **CONFIDENCE**: How confident are you in this inference? (0.0-1.0) - If < 0.7, set needs_clarification to true and provide clarifying_questions - Provide a brief reason for your confidence level - If confidence is low, provide an example of what a great input would look like ## OUTPUT FORMAT Return a JSON object: ```json {{ "input_type": "keywords|question|goal|mixed", "primary_question": "The main question to answer", "secondary_questions": ["question 1", "question 2", "question 3"], "purpose": "one of the purpose options", "content_output": "one of the content options", "expected_deliverables": ["deliverable1", "deliverable2"], "depth": "overview|detailed|expert", "focus_areas": ["area1", "area2"], "perspective": "target perspective or null", "time_sensitivity": "real_time|recent|historical|evergreen", "confidence": 0.85, "confidence_reason": "Brief explanation of why this confidence level (e.g., 'User provided clear keywords and context' or 'Input is vague, missing specific goals')", "great_example": "Example of what a great input would look like for this research (only if confidence < 0.8)", "needs_clarification": false, "clarifying_questions": [], "analysis_summary": "Brief summary of what the user wants" }} ``` ## IMPORTANT RULES 1. Always convert vague input into a specific primary question 2. Infer deliverables based on purpose (e.g., create_content → statistics + examples) 3. Use persona context to refine perspective and focus areas 4. If input is ambiguous, provide clarifying questions 5. Default to "detailed" depth unless input suggests otherwise 6. For content creation, include relevant deliverables automatically """ return prompt def build_query_generation_prompt( self, intent: ResearchIntent, research_persona: Optional[ResearchPersona] = None, ) -> str: """ Build prompt for generating targeted research queries. Generates multiple queries, each targeting a specific deliverable. """ deliverables_list = "\n".join([ f"- {d}: {self.DELIVERABLE_DESCRIPTIONS.get(ExpectedDeliverable(d), d)}" for d in intent.expected_deliverables ]) persona_keywords = "" if research_persona and research_persona.suggested_keywords: persona_keywords = f"\nSUGGESTED KEYWORDS FROM PERSONA: {', '.join(research_persona.suggested_keywords[:10])}" # Get current date context date_context = self._get_current_date_context() now = datetime.now() current_year = now.year next_year = current_year + 1 current_month_year = now.strftime("%B %Y") prompt = f"""You are a research query optimizer. Generate multiple targeted search queries based on the user's research intent. ## CURRENT DATE/TIME CONTEXT {date_context} **CRITICAL**: When generating queries: - ALWAYS use the CURRENT YEAR ({current_year}) for time-sensitive queries - For trends, predictions, or future-looking queries, use {current_year} or {next_year} - For recent/real-time queries, use current month/year: {current_month_year} - NEVER use outdated years from training data (e.g., 2024, 2025 if we're past those dates) - When user mentions "latest", "current", "recent", or time-sensitive terms, prioritize {current_year} data ## RESEARCH INTENT PRIMARY QUESTION: {intent.primary_question} SECONDARY QUESTIONS: {chr(10).join(f'- {q}' for q in intent.secondary_questions) if intent.secondary_questions else 'None'} PURPOSE: {intent.purpose} - {self.PURPOSE_EXPLANATIONS.get(ResearchPurpose(intent.purpose), intent.purpose)} CONTENT OUTPUT: {intent.content_output} EXPECTED DELIVERABLES: {deliverables_list} DEPTH: {intent.depth} FOCUS AREAS: {', '.join(intent.focus_areas) if intent.focus_areas else 'General'} PERSPECTIVE: {intent.perspective or 'General audience'} TIME SENSITIVITY: {intent.time_sensitivity or 'No specific requirement'} {persona_keywords} ## YOUR TASK Generate 4-8 targeted research queries. Each query should: 1. Target a specific deliverable or question 2. Be optimized for semantic search (Exa/Tavily) 3. Include relevant context for better results For each query, specify: - The query string - What deliverable it targets - Best provider (exa for semantic/deep, tavily for news/real-time, google for factual) - Priority (1-5, higher = more important) - What we expect to find ## OUTPUT FORMAT Return a JSON object: ```json {{ "queries": [ {{ "query": "Healthcare AI adoption statistics {current_year} hospitals implementation data", "purpose": "key_statistics", "provider": "exa", "priority": 5, "expected_results": "Statistics on hospital AI adoption rates" }}, {{ "query": "AI healthcare trends predictions future outlook {current_year} {next_year}", "purpose": "trends", "provider": "tavily", "priority": 4, "expected_results": "Current trends and future predictions in healthcare AI" }} ], "enhanced_keywords": ["keyword1", "keyword2", "keyword3"], "research_angles": [ "Angle 1: Focus on adoption challenges", "Angle 2: Focus on ROI and outcomes" ] }} ``` ## QUERY OPTIMIZATION RULES 1. For STATISTICS: Include words like "statistics", "data", "percentage", "report", "study", and CURRENT YEAR ({current_year}) 2. For CASE STUDIES: Include "case study", "success story", "implementation", "example" 3. For TRENDS: Include "trends", "future", "predictions", "emerging", and CURRENT YEAR ({current_year}) or {next_year} 4. For EXPERT QUOTES: Include expert names if known, or "expert opinion", "interview" 5. For COMPARISONS: Include "vs", "compare", "comparison", "alternative" 6. For NEWS/REAL-TIME: Use Tavily, include CURRENT YEAR ({current_year}) and current month/year ({current_month_year}) 7. For ACADEMIC/DEEP: Use Exa with neural search 8. **CRITICAL**: Always use {current_year} (not outdated years) for time-sensitive queries """ return prompt def build_intent_aware_analysis_prompt( self, raw_results: str, intent: ResearchIntent, research_persona: Optional[ResearchPersona] = None, ) -> str: """ Build prompt for analyzing research results based on user intent. This is the key prompt that extracts exactly what the user needs. """ purpose_explanation = self.PURPOSE_EXPLANATIONS.get( ResearchPurpose(intent.purpose), intent.purpose ) deliverables_instructions = self._build_deliverables_instructions(intent.expected_deliverables) perspective_instruction = "" if intent.perspective: perspective_instruction = f"\n**PERSPECTIVE**: Analyze results from the viewpoint of: {intent.perspective}" # Get current date context date_context = self._get_current_date_context() now = datetime.now() current_year = now.year prompt = f"""You are a research analyst helping a content creator find exactly what they need. Your job is to analyze raw research results and extract precisely what the user is looking for. ## CURRENT DATE/TIME CONTEXT {date_context} **CRITICAL**: When analyzing results: - Prioritize data from CURRENT YEAR ({current_year}) or recent dates - If statistics/quotes mention outdated years, note the recency in context - For trends/predictions, ensure timelines reference {current_year} or future years - NEVER present outdated data as "current" or "latest" - always check dates ## USER'S RESEARCH INTENT **PRIMARY QUESTION**: {intent.primary_question} **SECONDARY QUESTIONS TO ANSWER**: {chr(10).join(f'- {q}' for q in intent.secondary_questions) if intent.secondary_questions else 'None specified'} **FOCUS AREAS** (prioritize information related to these): {', '.join(intent.focus_areas) if intent.focus_areas else 'General - no specific focus areas'} **ALSO ANSWERING** (address these topics if found in results): {', '.join(intent.also_answering) if intent.also_answering else 'None specified'} **PURPOSE**: {intent.purpose} → {purpose_explanation} **CONTENT OUTPUT**: {intent.content_output} **EXPECTED DELIVERABLES**: {', '.join(intent.expected_deliverables)} **PERSPECTIVE**: {intent.perspective or 'General audience'} {perspective_instruction} ## RAW RESEARCH RESULTS {raw_results[:15000]} # Truncated for token limits ## YOUR TASK Analyze the raw research results and extract EXACTLY what the user needs. Use a **generalized approach** - don't over-optimize for specific fields, but ensure all intent aspects are considered naturally. ### ANALYSIS GUIDELINES: 1. **PRIMARY QUESTION**: Always provide a direct, clear answer to the primary question in 2-3 sentences. 2. **SECONDARY QUESTIONS**: For each secondary question, provide an answer if information is available in the results. If not available, note it in gaps_identified. Don't force answers - only include what's actually in the results. 3. **FOCUS AREAS**: When extracting deliverables, prioritize information that relates to the focus areas. If focus areas are specified: - Weight relevance scores higher for sources/content matching focus areas - Include focus area context in extracted statistics, quotes, case studies - If results don't address focus areas, note this in gaps_identified - Provide a brief summary of what was found for each focus area in focus_areas_coverage 4. **ALSO ANSWERING**: If results contain information about "also answering" topics, include it naturally in the analysis. Don't create separate sections unless the information is substantial. Provide a brief summary of what was found for each topic in also_answering_coverage. 5. **GENERALIZED EXTRACTION**: - Extract deliverables based on expected_deliverables - Use perspective to frame information appropriately - Consider content_output when structuring results - Don't over-optimize - let the results guide what's extracted 6. **CONTEXTUAL LINKING**: When extracting information, consider: - How it relates to the primary question - Which secondary questions it answers - Which focus areas it addresses - This helps create a cohesive research result {deliverables_instructions} ## OUTPUT REQUIREMENTS Provide results in this JSON structure: ```json {{ "primary_answer": "Direct 2-3 sentence answer to the primary question", "secondary_answers": {{ "Secondary Question 1?": "Answer if found in results, or null if not available", "Secondary Question 2?": "Answer if found in results, or null if not available" }}, "focus_areas_coverage": {{ "Focus Area 1": "Brief summary of what was found related to this focus area, or null if not covered", "Focus Area 2": "Brief summary of what was found related to this focus area, or null if not covered" }}, "also_answering_coverage": {{ "Topic 1": "Information found about this topic, or null if not found", "Topic 2": "Information found about this topic, or null if not found" }}, "executive_summary": "2-3 sentence executive summary of all findings", "key_takeaways": [ "Key takeaway 1 - most important finding", "Key takeaway 2", "Key takeaway 3", "Key takeaway 4", "Key takeaway 5" ], "statistics": [ {{ "statistic": "72% of hospitals plan to adopt AI by {current_year}", "value": "72%", "context": "Survey of 500 US hospitals in {current_year}", "source": "Healthcare AI Report {current_year}", "url": "https://example.com/report", "credibility": 0.9, "recency": "{current_year}" }} ], "expert_quotes": [ {{ "quote": "AI will revolutionize patient care within 5 years", "speaker": "Dr. Jane Smith", "title": "Chief Medical Officer", "organization": "HealthTech Inc", "source": "TechCrunch", "url": "https://example.com/article" }} ], "case_studies": [ {{ "title": "Mayo Clinic AI Implementation", "organization": "Mayo Clinic", "challenge": "High patient wait times", "solution": "AI-powered triage system", "outcome": "40% reduction in wait times", "key_metrics": ["40% faster triage", "95% patient satisfaction"], "source": "Healthcare IT News", "url": "https://example.com" }} ], "trends": [ {{ "trend": "AI-assisted diagnostics adoption", "direction": "growing", "evidence": ["25% YoY growth", "Major hospital chains investing"], "impact": "Could reduce misdiagnosis by 30%", "timeline": "Expected mainstream by {current_year + 2}", "sources": ["url1", "url2"] }} ], "comparisons": [ {{ "title": "Top AI Healthcare Platforms", "criteria": ["Cost", "Features", "Support"], "items": [ {{ "name": "Platform A", "pros": ["Easy integration", "Good support"], "cons": ["Higher cost"], "features": {{"Cost": "$500/month", "Support": "24/7"}} }} ], "verdict": "Platform A best for large hospitals" }} ], "best_practices": [ "Start with a pilot program before full deployment", "Ensure staff training is comprehensive" ], "step_by_step": [ "Step 1: Assess current infrastructure", "Step 2: Define use cases", "Step 3: Select vendor" ], "pros_cons": {{ "subject": "AI in Healthcare", "pros": ["Improved accuracy", "Cost savings"], "cons": ["Initial investment", "Training required"], "balanced_verdict": "Benefits outweigh costs for most hospitals" }}, "definitions": {{ "Clinical AI": "AI systems designed for medical diagnosis and treatment recommendations" }}, "examples": [ "Example: Hospital X reduced readmissions by 25% using predictive AI" ], "predictions": [ "By {current_year + 5}, AI will assist in 80% of initial diagnoses" ], "suggested_outline": [ "1. Introduction: The AI Healthcare Revolution", "2. Current State: Where We Are Today", "3. Key Statistics and Trends", "4. Case Studies: Success Stories", "5. Implementation Guide", "6. Future Outlook" ], "sources": [ {{ "title": "Healthcare AI Report {current_year}", "url": "https://example.com", "relevance_score": 0.95, "relevance_reason": "Directly addresses adoption statistics", "content_type": "research report", "credibility_score": 0.9 }} ], "confidence": 0.85, "gaps_identified": [ "Specific cost data for small clinics not found", "Limited information on regulatory challenges" ], "follow_up_queries": [ "AI healthcare regulations FDA {current_year}", "Small clinic AI implementation costs" ] }} ``` ## CRITICAL RULES 1. **ONLY include information directly from the raw results** - do not make up data 2. **ALWAYS include source URLs** for every statistic, quote, and case study 3. **If a deliverable type has no relevant data**, return an empty array for it 4. **Prioritize recency and credibility** when multiple sources conflict 5. **Answer the PRIMARY QUESTION directly** in 2-3 clear sentences 6. **Keep KEY TAKEAWAYS to 5-7 points** - the most important findings 7. **Add to gaps_identified** if expected information is missing 8. **Suggest follow_up_queries** for gaps or incomplete areas 9. **Rate confidence** based on how well results match the user's intent 10. **Include deliverables ONLY if they are in expected_deliverables** or critical to the question 11. **Don't over-optimize** - use a natural, generalized approach that considers all intent fields without forcing connections 12. **For focus_areas_coverage and also_answering_coverage**: Only include entries for focus areas/topics that actually have information in the results. Use null for areas/topics not covered. """ return prompt def _build_persona_context( self, research_persona: Optional[ResearchPersona], industry: Optional[str], target_audience: Optional[str], ) -> str: """Build persona context section for prompts.""" if not research_persona and not industry: return "No specific persona context available." context_parts = [] if research_persona: context_parts.append(f"INDUSTRY: {research_persona.default_industry}") context_parts.append(f"TARGET AUDIENCE: {research_persona.default_target_audience}") if research_persona.suggested_keywords: context_parts.append(f"TYPICAL TOPICS: {', '.join(research_persona.suggested_keywords[:5])}") if research_persona.research_angles: context_parts.append(f"RESEARCH ANGLES: {', '.join(research_persona.research_angles[:3])}") else: if industry: context_parts.append(f"INDUSTRY: {industry}") if target_audience: context_parts.append(f"TARGET AUDIENCE: {target_audience}") return "\n".join(context_parts) def _build_competitor_context(self, competitor_data: Optional[List[Dict]]) -> str: """Build competitor context section for prompts.""" if not competitor_data: return "" competitor_names = [] for comp in competitor_data[:5]: # Limit to 5 name = comp.get("name") or comp.get("domain") or comp.get("url", "Unknown") competitor_names.append(name) if competitor_names: return f"\nKNOWN COMPETITORS: {', '.join(competitor_names)}" return "" def _build_deliverables_instructions(self, expected_deliverables: List[str]) -> str: """Build specific extraction instructions for each expected deliverable.""" instructions = ["### EXTRACTION INSTRUCTIONS\n"] instructions.append("For each requested deliverable, extract the following:\n") deliverable_instructions = { ExpectedDeliverable.KEY_STATISTICS: """ **STATISTICS**: - Extract ALL relevant statistics with exact numbers - Include source attribution (publication name, URL) - Note the recency of the data - Rate credibility based on source authority - Format: statistic statement, value, context, source, URL, credibility score """, ExpectedDeliverable.EXPERT_QUOTES: """ **EXPERT QUOTES**: - Extract authoritative quotes from named experts - Include speaker name, title, and organization - Provide context for the quote - Include source URL """, ExpectedDeliverable.CASE_STUDIES: """ **CASE STUDIES**: - Summarize each case study: challenge → solution → outcome - Include key metrics and results - Name the organization involved - Provide source URL """, ExpectedDeliverable.TRENDS: """ **TRENDS**: - Identify current and emerging trends - Note direction: growing, declining, emerging, or stable - List supporting evidence - Include timeline predictions if available - Cite sources """, ExpectedDeliverable.COMPARISONS: """ **COMPARISONS**: - Build comparison tables where applicable - Define clear comparison criteria - List pros and cons for each option - Provide a verdict/recommendation if data supports it """, ExpectedDeliverable.BEST_PRACTICES: """ **BEST PRACTICES**: - Extract recommended approaches - Provide actionable guidelines - Order by importance or sequence """, ExpectedDeliverable.STEP_BY_STEP: """ **STEP BY STEP**: - Extract process/how-to instructions - Number steps clearly - Include any prerequisites or requirements """, ExpectedDeliverable.PROS_CONS: """ **PROS AND CONS**: - List advantages (pros) - List disadvantages (cons) - Provide a balanced verdict """, ExpectedDeliverable.DEFINITIONS: """ **DEFINITIONS**: - Extract clear explanations of key terms and concepts - Keep definitions concise but comprehensive """, ExpectedDeliverable.EXAMPLES: """ **EXAMPLES**: - Extract concrete examples that illustrate key points - Include real-world applications """, ExpectedDeliverable.PREDICTIONS: """ **PREDICTIONS**: - Extract future outlook and predictions - Note the source and their track record if known - Include timeframes where mentioned """, ExpectedDeliverable.CITATIONS: """ **CITATIONS**: - List all authoritative sources with URLs - Rate credibility and relevance - Note content type (research, news, opinion, etc.) """, } for deliverable in expected_deliverables: try: d_enum = ExpectedDeliverable(deliverable) if d_enum in deliverable_instructions: instructions.append(deliverable_instructions[d_enum]) except ValueError: pass return "\n".join(instructions)