Base code

Author: Kunthawat Greethong
Date: 2026-01-08 22:39:53 +07:00
parent 697115c61a
commit c35fa52117
2169 changed files with 626670 additions and 0 deletions

backend/services/cache/research_cache.py (new normal file, vendored, 172 lines)

@@ -0,0 +1,172 @@
"""
Research Cache Service
Provides intelligent caching for Google grounded research results to reduce API costs.
Only returns cached results for exact keyword matches to ensure accuracy.
"""
import hashlib
import json
from typing import Dict, Any, Optional, List
from datetime import datetime, timedelta
from loguru import logger


class ResearchCache:
    """Cache for research results with exact keyword matching."""

    def __init__(self, max_cache_size: int = 100, cache_ttl_hours: int = 24):
        """
        Initialize the research cache.

        Args:
            max_cache_size: Maximum number of cached entries
            cache_ttl_hours: Time-to-live for cache entries in hours
        """
        self.cache: Dict[str, Dict[str, Any]] = {}
        self.max_cache_size = max_cache_size
        self.cache_ttl = timedelta(hours=cache_ttl_hours)

    def _generate_cache_key(self, keywords: List[str], industry: str, target_audience: str) -> str:
        """
        Generate a cache key based on exact keyword match.

        Args:
            keywords: List of research keywords
            industry: Industry context
            target_audience: Target audience context

        Returns:
            MD5 hash of the normalized parameters
        """
        # Normalize and sort keywords for consistent hashing
        normalized_keywords = sorted([kw.lower().strip() for kw in keywords])
        normalized_industry = industry.lower().strip() if industry else "general"
        normalized_audience = target_audience.lower().strip() if target_audience else "general"
        # Create a consistent string representation
        cache_string = f"{normalized_keywords}|{normalized_industry}|{normalized_audience}"
        # Generate MD5 hash
        return hashlib.md5(cache_string.encode('utf-8')).hexdigest()

    def _is_cache_entry_valid(self, entry: Dict[str, Any]) -> bool:
        """Check if a cache entry is still valid (not expired)."""
        if 'created_at' not in entry:
            return False
        created_at = datetime.fromisoformat(entry['created_at'])
        return datetime.now() - created_at < self.cache_ttl

    def _cleanup_expired_entries(self):
        """Remove expired cache entries."""
        expired_keys = []
        for key, entry in self.cache.items():
            if not self._is_cache_entry_valid(entry):
                expired_keys.append(key)
        for key in expired_keys:
            del self.cache[key]
            logger.debug(f"Removed expired cache entry: {key}")

    def _evict_oldest_entries(self, num_to_evict: int):
        """Evict the oldest cache entries when cache is full."""
        # Sort by creation time and remove oldest entries
        sorted_entries = sorted(
            self.cache.items(),
            key=lambda x: x[1].get('created_at', ''),
            reverse=False
        )
        for i in range(min(num_to_evict, len(sorted_entries))):
            key = sorted_entries[i][0]
            del self.cache[key]
            logger.debug(f"Evicted oldest cache entry: {key}")

    def get_cached_result(self, keywords: List[str], industry: str, target_audience: str) -> Optional[Dict[str, Any]]:
        """
        Get cached research result for exact keyword match.

        Args:
            keywords: List of research keywords
            industry: Industry context
            target_audience: Target audience context

        Returns:
            Cached research result if found and valid, None otherwise
        """
        cache_key = self._generate_cache_key(keywords, industry, target_audience)
        if cache_key not in self.cache:
            logger.debug(f"Cache miss for keywords: {keywords}")
            return None
        entry = self.cache[cache_key]
        # Check if entry is still valid
        if not self._is_cache_entry_valid(entry):
            del self.cache[cache_key]
            logger.debug(f"Cache entry expired for keywords: {keywords}")
            return None
        logger.info(f"Cache hit for keywords: {keywords} (saved API call)")
        return entry.get('result')

    def cache_result(self, keywords: List[str], industry: str, target_audience: str, result: Dict[str, Any]):
        """
        Cache a research result.

        Args:
            keywords: List of research keywords
            industry: Industry context
            target_audience: Target audience context
            result: Research result to cache
        """
        cache_key = self._generate_cache_key(keywords, industry, target_audience)
        # Cleanup expired entries first
        self._cleanup_expired_entries()
        # Check if cache is full and evict if necessary
        if len(self.cache) >= self.max_cache_size:
            num_to_evict = len(self.cache) - self.max_cache_size + 1
            self._evict_oldest_entries(num_to_evict)
        # Store the result
        self.cache[cache_key] = {
            'result': result,
            'created_at': datetime.now().isoformat(),
            'keywords': keywords,
            'industry': industry,
            'target_audience': target_audience
        }
        logger.info(f"Cached research result for keywords: {keywords}")

    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics."""
        self._cleanup_expired_entries()
        return {
            'total_entries': len(self.cache),
            'max_size': self.max_cache_size,
            'ttl_hours': self.cache_ttl.total_seconds() / 3600,
            'entries': [
                {
                    'keywords': entry['keywords'],
                    'industry': entry['industry'],
                    'target_audience': entry['target_audience'],
                    'created_at': entry['created_at']
                }
                for entry in self.cache.values()
            ]
        }

    def clear_cache(self):
        """Clear all cached entries."""
        self.cache.clear()
        logger.info("Research cache cleared")


# Global cache instance
research_cache = ResearchCache()
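
For reference, below is a minimal usage sketch of the module-level research_cache instance defined above. It is an illustration only: the import path assumes the repository root is on the Python path, and run_grounded_research is a hypothetical stand-in for whatever service actually performs the Google grounded research.

from backend.services.cache.research_cache import research_cache


def run_grounded_research(keywords, industry, target_audience):
    # Hypothetical placeholder for the real grounded-research call.
    return {"summary": f"research for {keywords}", "sources": []}


def get_research(keywords, industry, target_audience):
    # Serve an exact keyword/industry/audience match from the cache when possible.
    cached = research_cache.get_cached_result(keywords, industry, target_audience)
    if cached is not None:
        return cached
    result = run_grounded_research(keywords, industry, target_audience)
    research_cache.cache_result(keywords, industry, target_audience, result)
    return result


# The first call performs the (stubbed) research and caches it; the second call
# differs only in casing and keyword order, so it is served from the cache.
get_research(["ai seo tools", "content marketing"], "software", "marketers")
get_research(["Content Marketing", "AI SEO Tools"], "Software", "Marketers")
print(research_cache.get_cache_stats()["total_entries"])  # 1

Because _generate_cache_key lowercases, strips, and sorts the keywords and normalizes industry and audience, the two calls above hash to the same key, which is what makes the second call a hit.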