# ALwrity/backend/services/image_generation/visual_data_extractor.py

"""
Visual Data Extractor for Image Generation Prompts.

This module provides intelligent extraction of visual-relevant data from blog sections
and research data to generate contextually relevant image prompts.

Key Features:
- Statistics extraction with regex patterns
- Domain-specific visual concept detection
- Research source mining for visual data
- Deduplication and data cleaning
"""

import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Set, Tuple


# Pre-compiled regex patterns for performance.
# Each entry is (label, compiled pattern). The label is descriptive only;
# _extract_statistic_with_context tries the entries in list order and uses the
# first pattern that matches, so earlier entries take priority.
_STATISTICAL_PATTERNS: List[Tuple[str, re.Pattern]] = [
    ('percentage', re.compile(r'\d+[\d,]*%', re.IGNORECASE)),
    ('currency', re.compile(r'\$[\d,]+(?:\.\d{2})?', re.IGNORECASE)),
    # Trailing \b keeps dimensions ("1920x1080") and hex literals ("0x1F")
    # from being read as multipliers.
    ('multiplier', re.compile(r'\d+[\d,]*x\b', re.IGNORECASE)),
    ('large_number', re.compile(r'\d+[\d,]*\s*(?:million|billion|thousand|trillion)s?', re.IGNORECASE)),
    ('range', re.compile(r'\d+\s*-\s*\d+%', re.IGNORECASE)),
    ('change_up', re.compile(r'up\s+by\s+\d+%', re.IGNORECASE)),
    ('change_down', re.compile(r'down\s+by\s+\d+%', re.IGNORECASE)),
    ('growth', re.compile(r'(?:increased|decreased|grew|declined)\s*[\d%]+', re.IGNORECASE)),
    ('cagr', re.compile(r'cagr\s+of\s+[\d.]+%', re.IGNORECASE)),
]

# Patterns that signal "this text talks about something chartable".
# Word boundaries stop short alternatives from matching inside unrelated
# words ("top" in "laptop", "vs" in "devs", "graph" in "paragraph").
_VISUAL_DATA_PATTERNS: List[Tuple[str, re.Pattern]] = [
    ('times', re.compile(r'\d+\s*(?:times|folds?)\b', re.IGNORECASE)),
    ('ranking', re.compile(r'\brank(?:ed|ing)?\s*(?:#?\d+|first|second|third|top|bottom)', re.IGNORECASE)),
    ('comparison', re.compile(r'\b(?:vs|versus|compared\s+to|compared\s+with)\b', re.IGNORECASE)),
    # Leading boundary only, so plurals and derived forms ("charts",
    # "graphic", "visualize") still count.
    ('chart_mention', re.compile(r'\b(?:chart|graph|diagram|visual|infographic)', re.IGNORECASE)),
    ('superlative', re.compile(r'\b(?:best|worst|leading|top|highest|lowest)\b', re.IGNORECASE)),
]

# Lowercase keywords that mark a key point as describing a trend or comparison.
_TREND_KEYWORDS: Set[str] = {
    'increase', 'decrease', 'growth', 'trend', 'pattern', 'comparison',
    'ranking', 'versus', 'vs', 'rise', 'fall', 'decline',
    'surge', 'drop', 'climb', 'jump', 'plummet', 'soar', 'fluctuate'
}


# Domain-specific visual concepts mapping.
# Keys are domain names; values are visual concept phrases for that domain.
# Order inside each list matters: _detect_domains_in_text uses the domain key
# plus the first five entries as detection keywords, and a detected domain
# contributes its first three entries as concepts. Dict order determines the
# order in which detected domains are reported.
DOMAIN_VISUAL_CONCEPTS: Dict[str, List[str]] = {
    "tech": [
        "circuit board patterns", "digital interface", "data stream", "network nodes",
        "server racks", "silicon chips", "binary code", "cloud computing",
        "artificial intelligence", "machine learning model", "software code",
        "technology innovation", "digital transformation"
    ],
    "healthcare": [
        "stethoscope", "medical chart", "hospital equipment", "DNA helix",
        "heart rate monitor", "medical cross", "prescription", "patient care",
        "healthcare professional", "medical research", "wellness", "health metrics"
    ],
    "finance": [
        "stock chart", "dollar signs", "investment growth", "banking",
        "pie chart", "financial graph", "portfolio", "market trends",
        "cryptocurrency", "blockchain", "financial analysis", "wealth management"
    ],
    "marketing": [
        "digital marketing", "social media", "content strategy", "audience growth",
        "brand awareness", "conversion funnel", "engagement metrics", "ROI chart",
        "marketing analytics", "customer acquisition", "viral content"
    ],
    "education": [
        "classroom", "graduation cap", "books", "learning curve",
        "knowledge growth", "student achievement", "online learning", "curriculum",
        "educational technology", "academic success", "skill development"
    ],
    "ecommerce": [
        "shopping cart", "product display", "checkout flow", "conversion",
        "customer journey", "inventory", "shipping", "discount tags",
        "online store", "e-commerce analytics", "retail technology"
    ],
    "real_estate": [
        "building", "house", "property", "real estate market",
        "mortgage", "home ownership", "apartment complex", "construction",
        "property investment", "housing market", "architecture"
    ],
    "food": [
        "restaurant", "cooking", "ingredients", "food preparation",
        "recipe", "menu", "dining experience", "culinary arts",
        "gourmet", "food photography", "healthy eating"
    ],
    "travel": [
        "airplane", "destination", "map", "luggage", "passport",
        "tourist", "hotel", "beach resort", "adventure", "travel planning",
        "vacation", "world exploration"
    ],
    "fitness": [
        "gym", "workout", "exercise", "muscle", "weight loss",
        "nutrition", "running", "yoga", "healthy lifestyle", "fitness tracking",
        "sports training", "wellness"
    ],
    "fashion": [
        "clothing", "wardrobe", "style", "runway", "designer",
        "outfit", "accessories", "fashion trends", "personal style", "apparel"
    ],
    "entertainment": [
        "movie reel", "music note", "concert", "celebrity", "streaming",
        "gaming", "content creation", "media production", "creative arts", "performance"
    ],
    "business": [
        "office", "meeting", "presentation", "business growth", "strategy",
        "team collaboration", "enterprise", "corporate", "leadership", "productivity"
    ],
    "science": [
        "laboratory", "microscope", "experiment", "data analysis", "research",
        "scientific method", "discovery", "innovation", "technology development"
    ],
    "sports": [
        "stadium", "athlete", "scoreboard", "trophy", "team",
        "competition", "fitness", "championship", "sports analytics", "training"
    ],
    "legal": [
        "gavel", "courthouse", "legal documents", "scales of justice",
        "law books", "legal contract", "attorney", "courtroom", "compliance"
    ],
    "environmental": [
        "renewable energy", "solar panels", "wind turbines", "green technology",
        "sustainability", "climate change", "eco-friendly", "nature conservation"
    ],
}


@dataclass
class ExtractedVisualData:
    """Result container filled in by the extraction helpers of this module.

    Every field is a list of strings and starts out empty.
    """
    # Keywords taken from the section's "keywords" list.
    visual_keywords: List[str] = field(default_factory=list)
    # Free-text items that mention trends, comparisons or other visual data.
    data_points: List[str] = field(default_factory=list)
    # Headings, subheadings and plain statements.
    concepts: List[str] = field(default_factory=list)
    # Text snippets surrounding a matched statistic.
    statistics: List[str] = field(default_factory=list)
    # Visual concept phrases contributed by detected domains.
    domain_concepts: List[str] = field(default_factory=list)
    # Not populated by any code visible in this module.
    visual_metaphors: List[str] = field(default_factory=list)
    # Names of the domains that were detected.
    detected_domains: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, List[str]]:
        """Return the fields as a dict keyed by field name.

        The values are the instance's own list objects, not copies.
        """
        field_names = (
            "visual_keywords",
            "data_points",
            "concepts",
            "statistics",
            "domain_concepts",
            "visual_metaphors",
            "detected_domains",
        )
        return {name: getattr(self, name) for name in field_names}

    def has_statistics(self) -> bool:
        """Return True when at least one statistic was extracted."""
        return True if self.statistics else False

    def has_data_points(self) -> bool:
        """Return True when at least one data point was extracted."""
        return True if self.data_points else False

    def has_domain_concepts(self) -> bool:
        """Return True when at least one domain concept was extracted."""
        return True if self.domain_concepts else False

    def is_data_heavy(self) -> bool:
        """Return True when statistics or data points are present."""
        return self.has_statistics() or self.has_data_points()

    def get_recommended_image_type(self) -> str:
        """Suggest an image type for the extracted data.

        Returns:
            "infographic" when data is present together with domain concepts,
            "chart" when data is present without them, "conceptual" otherwise.
        """
        contains_data = self.has_statistics() or self.has_data_points()
        if not contains_data:
            return "conceptual"
        if self.has_domain_concepts():
            return "infographic"
        return "chart"


def _extract_statistic_with_context(text: str) -> Optional[str]:
    """
    Return the text surrounding the first statistic found in ``text``.

    The patterns in ``_STATISTICAL_PATTERNS`` are tried in list order and the
    first pattern that matches anywhere in the text is used (not necessarily
    the earliest statistic in the text).

    Args:
        text: Input text to search

    Returns:
        The matched statistic with up to 60 characters before it and up to
        30 characters after it, stripped of surrounding whitespace, or None
        when no pattern matches.
    """
    searches = (regex.search(text) for _label, regex in _STATISTICAL_PATTERNS)
    found = next((hit for hit in searches if hit), None)
    if found is None:
        return None

    window_start = max(0, found.start() - 60)
    window_end = min(len(text), found.end() + 30)
    snippet = text[window_start:window_end].strip()

    # When the window was cut out of the middle of the text it probably starts
    # mid-word; drop everything up to the first space if that space comes early.
    if window_start > 0:
        cut = snippet.find(' ')
        if 0 < cut < 20:
            snippet = snippet[cut + 1:]
    return snippet


def _has_visual_mention(text: str) -> bool:
    """
    Tell whether ``text`` matches any pattern in ``_VISUAL_DATA_PATTERNS``.

    Args:
        text: Input text to check

    Returns:
        True as soon as one pattern matches, False when none do
    """
    return any(regex.search(text) for _label, regex in _VISUAL_DATA_PATTERNS)


def _has_trend_keyword(text: str) -> bool:
    """
    Check if text contains trend/comparison keywords.

    A keyword counts only when it begins a word. This keeps inflected forms
    ("increased", "trending", "dropped") while ignoring keywords buried inside
    unrelated words ("rise" in "enterprise", "vs" in "devs", "fall" in
    "waterfall").

    Args:
        text: Input text to check

    Returns:
        True if any keyword from ``_TREND_KEYWORDS`` starts a word in the text
    """
    text_lower = text.lower()
    # re caches compiled patterns, so building them per call stays cheap.
    return any(
        re.search(r'\b' + re.escape(keyword), text_lower) is not None
        for keyword in _TREND_KEYWORDS
    )


def _detect_domains_in_text(text: str) -> Tuple[List[str], List[str]]:
    """
    Detect industry/domain from text and return relevant visual concepts.

    A domain is detected when the lowercased text contains the domain name
    (as written, or with underscores replaced by spaces) or any of the
    domain's first five concept phrases. Matching is a case-insensitive
    substring test.

    Args:
        text: Input text to analyze

    Returns:
        Tuple of (detected_domain_names, domain_concepts). Domains appear in
        ``DOMAIN_VISUAL_CONCEPTS`` order; concepts are the first three phrases
        of each detected domain, de-duplicated in first-seen order.
    """
    text_lower = text.lower()
    detected_domains: List[str] = []
    all_concepts: List[str] = []

    for domain, concepts in DOMAIN_VISUAL_CONCEPTS.items():
        # Prose writes "real estate", not "real_estate"; accept both spellings.
        keywords_to_check = [domain, domain.replace('_', ' ')] + concepts[:5]
        # Lowercase each keyword: the text is lowercased, so a mixed-case
        # phrase such as "DNA helix" could otherwise never match.
        if any(keyword.lower() in text_lower for keyword in keywords_to_check):
            detected_domains.append(domain)
            # Add top 3 concepts for this domain
            all_concepts.extend(concepts[:3])

    # dict.fromkeys removes duplicates while keeping a stable order; a set
    # would make the result order vary between interpreter runs.
    return detected_domains, list(dict.fromkeys(all_concepts))


def _deduplicate_and_limit(
    items: List[str],
    max_items: int = 10,
    key_length: int = 50
) -> List[str]:
    """
    Drop near-duplicate strings and cap the size of the result.

    Two items count as duplicates when their lowercased, stripped text agrees
    on the first ``key_length`` characters. Falsy entries, non-strings and
    whitespace-only strings are skipped.

    Args:
        items: Strings to filter
        max_items: Upper bound on the number of returned items
        key_length: Number of leading characters used as the comparison key

    Returns:
        The first occurrence of each distinct item, stripped, in input order,
        with at most ``max_items`` entries
    """
    kept: List[str] = []
    known_keys: Set[str] = set()

    for candidate in items:
        if not candidate or not isinstance(candidate, str):
            continue

        key = candidate.lower().strip()[:key_length]
        if not key or key in known_keys:
            continue
        if len(kept) >= max_items:
            continue

        known_keys.add(key)
        kept.append(candidate.strip())

    return kept


def extract_visual_data(
    section: Optional[Dict[str, Any]],
    research: Optional[Dict[str, Any]]
) -> ExtractedVisualData:
    """
    Intelligently extract visual-relevant data from blog section and research.

    This function analyzes section headings, key points, subheadings, keywords,
    and research data to extract statistics, data points, visual concepts,
    and domain-specific visual concepts. Either argument may be None or empty,
    in which case that phase is skipped.

    Args:
        section: Blog section dictionary with optional keys:
            - heading: Section title
            - subheadings: List of subheading strings
            - key_points: List of key point strings
            - keywords: List of keyword strings
        research: Research data dictionary with optional keys:
            - key_facts, highlights: List of fact strings
            - insights, summary: String or list of insight strings
            - sources, references: List of source dictionaries
            - keywords: Dict or list of keywords
            - domain, industry: Domain/industry string

    Returns:
        ExtractedVisualData dataclass with extracted, de-duplicated information

    Example:
        >>> section = {
        ...     "heading": "AI in Healthcare",
        ...     "key_points": ["Market grew 40% in 2023", "Investment reached $5B"]
        ... }
        >>> result = extract_visual_data(section, None)
        >>> result.statistics
        ['Market grew 40% in 2023', 'Investment reached $5B']
        >>> sorted(result.domain_concepts)
        ['hospital equipment', 'medical chart', 'stethoscope']
    """
    result = ExtractedVisualData()

    # Phase 1: Extract from section
    if section:
        _extract_from_section(section, result)

    # Phase 2: Extract from research
    if research:
        _extract_from_research(research, result)

    # Phase 3: Deduplicate all extracted data
    _deduplicate_results(result)

    return result


def _extract_from_section(section: Dict, result: ExtractedVisualData) -> None:
    """Collect visual data from a blog section into ``result`` (mutated in place).

    Reads the "key_points" (first 10), "subheadings" (first 7), "keywords"
    (first 12) and "heading" entries of ``section``; non-string items are
    ignored.
    """

    def record_domains(text: str) -> None:
        # Append whatever domains/concepts the text triggers to the result.
        names, ideas = _detect_domains_in_text(text)
        result.detected_domains.extend(names)
        result.domain_concepts.extend(ideas)

    # Key points: statistic > trend/visual data point > plain concept.
    for point in (section.get("key_points", []) or [])[:10]:
        if not isinstance(point, str):
            continue

        stat = _extract_statistic_with_context(point)
        if stat:
            result.statistics.append(stat)
            record_domains(point)
        elif _has_visual_mention(point) or _has_trend_keyword(point):
            # Data points are stored as-is; no domain detection on this path.
            result.data_points.append(point)
        else:
            result.concepts.append(point)
            record_domains(point)

    # Subheadings are always treated as concepts.
    for subhead in (section.get("subheadings", []) or [])[:7]:
        if isinstance(subhead, str):
            result.concepts.append(subhead)
            record_domains(subhead)

    # Keywords are copied over verbatim.
    result.visual_keywords.extend(
        str(kw)
        for kw in (section.get("keywords", []) or [])[:12]
        if kw and isinstance(kw, str)
    )

    # The heading contributes domains and becomes the leading concept.
    heading = section.get("heading", "")
    if heading and isinstance(heading, str):
        record_domains(heading)
        title = heading.strip()
        if title:
            result.concepts.insert(0, title)


def _extract_from_research(research: Dict, result: ExtractedVisualData) -> None:
    """Collect visual data from research data into ``result`` (mutated in place).

    Reads these optional keys of ``research``:
        - key_facts / highlights: fact strings (first 7); a fact with a
          statistic goes to ``statistics``, otherwise to ``data_points``
        - insights / summary: a string (split into sentences) or a list of
          strings (first 7); statistic or ``concepts``
        - sources / references: dicts with "title" and
          "excerpt"/"snippet"/"description" (first 7 sources)
        - keywords: list, or dict with "primary_keywords"/"primary" (first 7)
        - domain / industry: string scanned for known domains

    Values of an unexpected type are skipped instead of raising. Where a list
    is expected, a bare string is treated as a single entry rather than being
    iterated character by character.
    """

    def as_list(value) -> list:
        # Normalise a container value so slicing and iteration are safe.
        if isinstance(value, (list, tuple)):
            return list(value)
        if isinstance(value, str) and value:
            return [value]
        return []

    def record_domains(text: str) -> None:
        domains, concepts = _detect_domains_in_text(text)
        result.detected_domains.extend(domains)
        result.domain_concepts.extend(concepts)

    def record_stat_or(text: str, fallback: List[str]) -> None:
        # Store the statistic context if there is one, else keep the raw text.
        stat = _extract_statistic_with_context(text)
        if stat:
            result.statistics.append(stat)
        else:
            fallback.append(text)

    # Extract from key facts/highlights
    key_facts = as_list(research.get("key_facts") or research.get("highlights"))
    for fact in key_facts[:7]:
        if isinstance(fact, str):
            record_stat_or(fact, result.data_points)

    # Extract from insights/summary
    insights = research.get("insights", []) or research.get("summary", "")
    if isinstance(insights, str):
        # Split only on a period that ends a sentence (followed by whitespace
        # or end of text) so decimals such as "4.5%" or "$3.50" stay intact.
        fragments = re.split(r'\.(?:\s+|$)', insights)[:7]
        sentences = [fragment.strip() for fragment in fragments]
    elif isinstance(insights, (list, tuple)):
        sentences = [item for item in insights[:7] if isinstance(item, str)]
    else:
        sentences = []
    for sentence in sentences:
        if sentence:
            record_stat_or(sentence, result.concepts)

    # Extract from research sources
    sources = as_list(research.get("sources") or research.get("references"))
    for source in sources[:7]:
        if not isinstance(source, dict):
            continue

        # Extract from source title
        source_title = source.get("title", "")
        if source_title and isinstance(source_title, str):
            record_domains(source_title)

        # Extract from source excerpt/snippet
        source_excerpt = (
            source.get("excerpt", "")
            or source.get("snippet", "")
            or source.get("description", "")
        )
        if source_excerpt and isinstance(source_excerpt, str):
            stat = _extract_statistic_with_context(source_excerpt)
            if stat:
                result.statistics.append(stat)

            # Add as data point (limited to 200 chars). A second, longer copy
            # for excerpts with visual mentions is deliberately not added: it
            # would share its leading characters with this entry and be
            # discarded by _deduplicate_and_limit.
            result.data_points.append(source_excerpt[:200])

            record_domains(source_excerpt)

    # Extract from research keywords
    research_keywords = research.get("keywords", {})
    if isinstance(research_keywords, dict):
        keyword_list = as_list(
            research_keywords.get("primary_keywords")
            or research_keywords.get("primary")
        )
    else:
        keyword_list = as_list(research_keywords)
    for kw in keyword_list[:7]:
        if isinstance(kw, str):
            record_domains(kw)

    # Extract from research domain/industry
    research_domain = research.get("domain", "") or research.get("industry", "")
    if research_domain and isinstance(research_domain, str):
        record_domains(research_domain)


def _deduplicate_results(result: ExtractedVisualData) -> None:
    """Deduplicate and cap the extracted lists of ``result`` in place.

    ``visual_keywords`` is limited to 12 entries; ``data_points``, ``concepts``,
    ``statistics`` and ``domain_concepts`` to 10 each. ``detected_domains`` is
    de-duplicated without a limit, keeping first-seen order.
    """
    result.visual_keywords = _deduplicate_and_limit(result.visual_keywords, 12)
    result.data_points = _deduplicate_and_limit(result.data_points, 10)
    result.concepts = _deduplicate_and_limit(result.concepts, 10)
    result.statistics = _deduplicate_and_limit(result.statistics, 10)
    result.domain_concepts = _deduplicate_and_limit(result.domain_concepts, 10)
    # dict.fromkeys keeps insertion order; list(set(...)) would reorder the
    # domains differently from one interpreter run to the next.
    result.detected_domains = list(dict.fromkeys(result.detected_domains))


def get_model_recommendation(visual_data: ExtractedVisualData) -> Optional[str]:
    """
    Build a model recommendation note for the extracted visual data.

    Data-heavy content takes precedence over domain-specific content.

    Args:
        visual_data: ExtractedVisualData instance

    Returns:
        A recommendation text starting with two newlines, or None when the
        data is neither data-heavy nor carries domain concepts
    """
    if visual_data.is_data_heavy():
        reason = "This section contains data/statistics. "
        suggestions = (
            "- FLUX Kontext Pro: Best for data visualizations with text labels",
            "- GLM-Image: Excellent for infographics and educational diagrams",
            "- Ideogram V3 Turbo: Good for simple charts with text overlays",
        )
    elif visual_data.has_domain_concepts():
        reason = "This section covers domain-specific content. "
        suggestions = (
            "- Qwen Image: Best for abstract conceptual imagery",
            "- FLUX Kontext Pro: Good for conceptual imagery with text support",
            "- FLUX 2 Flex: Excellent for poster-style conceptual designs",
        )
    else:
        return None

    return (
        "\n\nMODEL RECOMMENDATION: "
        + reason
        + "Consider using:\n"
        + "\n".join(suggestions)
    )


def build_visual_summary(visual_data: ExtractedVisualData) -> str:
    """
    Render the extracted visual data as a short multi-line text.

    Each non-empty category becomes one "Label: item, item, ..." line, in a
    fixed order and truncated to a per-category item count (detected domains
    are listed in full).

    Args:
        visual_data: ExtractedVisualData instance

    Returns:
        The lines joined by newlines, or an empty string when every category
        is empty
    """
    # (label, items, maximum number of items shown; None = no cap)
    categories = (
        ("Key Statistics", visual_data.statistics, 3),
        ("Data Points", visual_data.data_points, 3),
        ("Visual Concepts", visual_data.concepts, 5),
        ("Keywords", visual_data.visual_keywords, 8),
        ("Domain Visual Concepts", visual_data.domain_concepts, 5),
        ("Detected Domains", visual_data.detected_domains, None),
    )

    lines = [
        f"{label}: {', '.join(items[:cap])}"
        for label, items, cap in categories
        if items
    ]
    return "\n".join(lines)