Allowing AI to generate suggestions for the blog writer
This commit is contained in:
@@ -24,11 +24,19 @@ from models.blog_models import (
|
||||
BlogPublishRequest,
|
||||
BlogPublishResponse,
|
||||
BlogOutlineSection,
|
||||
ResearchSource,
|
||||
)
|
||||
|
||||
from ..research import ResearchService
|
||||
from ..outline import OutlineService
|
||||
from ..content.enhanced_content_generator import EnhancedContentGenerator
|
||||
from services.llm_providers.gemini_provider import gemini_structured_json_response
|
||||
from services.cache.persistent_content_cache import persistent_content_cache
|
||||
from models.blog_models import (
|
||||
MediumBlogGenerateRequest,
|
||||
MediumBlogGenerateResult,
|
||||
MediumGeneratedSection,
|
||||
)
|
||||
|
||||
|
||||
class BlogWriterService:
|
||||
@@ -258,3 +266,180 @@ class BlogWriterService:
|
||||
"""Publish content to specified platform."""
|
||||
# TODO: Move to content module
|
||||
return BlogPublishResponse(success=True, platform=request.platform, url="https://example.com/post")
|
||||
|
||||
async def generate_medium_blog_with_progress(self, req: MediumBlogGenerateRequest, task_id: str) -> MediumBlogGenerateResult:
|
||||
"""Use Gemini structured JSON to generate a medium-length blog in one call."""
|
||||
import time
|
||||
start = time.time()
|
||||
|
||||
# Prepare sections data for cache key generation
|
||||
sections_for_cache = []
|
||||
for s in req.sections:
|
||||
sections_for_cache.append({
|
||||
"id": s.id,
|
||||
"heading": s.heading,
|
||||
"keyPoints": getattr(s, "key_points", []) or getattr(s, "keyPoints", []),
|
||||
"subheadings": getattr(s, "subheadings", []),
|
||||
"keywords": getattr(s, "keywords", []),
|
||||
"targetWords": getattr(s, "target_words", None) or getattr(s, "targetWords", None),
|
||||
})
|
||||
|
||||
# Check cache first
|
||||
cached_result = persistent_content_cache.get_cached_content(
|
||||
keywords=req.researchKeywords or [],
|
||||
sections=sections_for_cache,
|
||||
global_target_words=req.globalTargetWords or 1000,
|
||||
persona_data=req.persona.dict() if req.persona else None,
|
||||
tone=req.tone,
|
||||
audience=req.audience
|
||||
)
|
||||
|
||||
if cached_result:
|
||||
logger.info(f"Using cached content for keywords: {req.researchKeywords} (saved expensive generation)")
|
||||
# Add cache hit marker to distinguish from fresh generation
|
||||
cached_result['generation_time_ms'] = 0 # Mark as cache hit
|
||||
cached_result['cache_hit'] = True
|
||||
return MediumBlogGenerateResult(**cached_result)
|
||||
|
||||
# Cache miss - proceed with AI generation
|
||||
logger.info(f"Cache miss - generating new content for keywords: {req.researchKeywords}")
|
||||
|
||||
# Build schema expected from the model
|
||||
schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"sections": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"id": {"type": "string"},
|
||||
"heading": {"type": "string"},
|
||||
"content": {"type": "string"},
|
||||
"wordCount": {"type": "number"},
|
||||
"sources": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {"title": {"type": "string"}, "url": {"type": "string"}},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
# Compose prompt
|
||||
def section_block(s):
|
||||
return {
|
||||
"id": s.id,
|
||||
"heading": s.heading,
|
||||
"outline": {
|
||||
"keyPoints": getattr(s, "key_points", []) or getattr(s, "keyPoints", []),
|
||||
"subheadings": getattr(s, "subheadings", []),
|
||||
"keywords": getattr(s, "keywords", []),
|
||||
"targetWords": getattr(s, "target_words", None) or getattr(s, "targetWords", None),
|
||||
"references": [
|
||||
{"title": r.title, "url": r.url} for r in getattr(s, "references", [])
|
||||
],
|
||||
},
|
||||
}
|
||||
|
||||
payload = {
|
||||
"title": req.title,
|
||||
"globalTargetWords": req.globalTargetWords or 1000,
|
||||
"persona": req.persona.dict() if req.persona else None,
|
||||
"tone": req.tone,
|
||||
"audience": req.audience,
|
||||
"sections": [section_block(s) for s in req.sections],
|
||||
}
|
||||
|
||||
system = (
|
||||
"You are a professional blog writer. Generate high-quality content for each section based on the provided outline. "
|
||||
"Write engaging, informative content that follows the section's key points and target word count. "
|
||||
"Use a professional tone and ensure the content flows naturally. "
|
||||
"Format content with proper paragraph breaks using double line breaks (\\n\\n) between paragraphs. "
|
||||
"Structure content with clear paragraphs - aim for 2-4 sentences per paragraph. "
|
||||
"Return ONLY valid JSON with no markdown formatting or explanations."
|
||||
)
|
||||
|
||||
import json
|
||||
prompt = (
|
||||
f"Write blog content for the following sections. Each section should be {req.globalTargetWords or 1000} words total, distributed across all sections.\n\n"
|
||||
f"Blog Title: {req.title}\n\n"
|
||||
"For each section, write engaging content that:\n"
|
||||
"- Follows the key points provided\n"
|
||||
"- Uses the suggested keywords naturally\n"
|
||||
"- Meets the target word count\n"
|
||||
"- Maintains professional tone\n"
|
||||
"- References the provided sources when relevant\n"
|
||||
"- Breaks content into clear paragraphs (2-4 sentences each)\n"
|
||||
"- Uses double line breaks (\\n\\n) between paragraphs for proper formatting\n"
|
||||
"- Starts with an engaging opening paragraph\n"
|
||||
"- Ends with a strong concluding paragraph\n\n"
|
||||
"IMPORTANT: Format the 'content' field with proper paragraph breaks using \\n\\n between paragraphs.\n\n"
|
||||
"Return a JSON object with 'title' and 'sections' array. Each section should have 'id', 'heading', 'content', and 'wordCount'.\n\n"
|
||||
f"Sections to write:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
|
||||
)
|
||||
|
||||
ai_resp = gemini_structured_json_response(
|
||||
prompt=prompt,
|
||||
schema=schema,
|
||||
temperature=0.2,
|
||||
max_tokens=8192,
|
||||
system_prompt=system,
|
||||
)
|
||||
|
||||
# Check for errors in AI response
|
||||
if not ai_resp or ai_resp.get("error"):
|
||||
error_msg = ai_resp.get("error", "Empty generation result from model") if ai_resp else "No response from model"
|
||||
logger.error(f"AI generation failed: {error_msg}")
|
||||
raise Exception(f"AI generation failed: {error_msg}")
|
||||
|
||||
# Normalize output
|
||||
title = ai_resp.get("title") or req.title
|
||||
out_sections = []
|
||||
for s in ai_resp.get("sections", []) or []:
|
||||
out_sections.append(
|
||||
MediumGeneratedSection(
|
||||
id=str(s.get("id")),
|
||||
heading=s.get("heading") or "",
|
||||
content=s.get("content") or "",
|
||||
wordCount=int(s.get("wordCount") or 0),
|
||||
sources=[
|
||||
# map to ResearchSource shape if possible; keep minimal
|
||||
ResearchSource(title=src.get("title", ""), url=src.get("url", ""))
|
||||
for src in (s.get("sources") or [])
|
||||
] or None,
|
||||
)
|
||||
)
|
||||
|
||||
duration_ms = int((time.time() - start) * 1000)
|
||||
result = MediumBlogGenerateResult(
|
||||
success=True,
|
||||
title=title,
|
||||
sections=out_sections,
|
||||
model="gemini-2.5-flash",
|
||||
generation_time_ms=duration_ms,
|
||||
safety_flags=None,
|
||||
)
|
||||
|
||||
# Cache the result for future use
|
||||
try:
|
||||
persistent_content_cache.cache_content(
|
||||
keywords=req.researchKeywords or [],
|
||||
sections=sections_for_cache,
|
||||
global_target_words=req.globalTargetWords or 1000,
|
||||
persona_data=req.persona.dict() if req.persona else None,
|
||||
tone=req.tone or "professional",
|
||||
audience=req.audience or "general",
|
||||
result=result.dict()
|
||||
)
|
||||
logger.info(f"Cached content result for keywords: {req.researchKeywords}")
|
||||
except Exception as cache_error:
|
||||
logger.warning(f"Failed to cache content result: {cache_error}")
|
||||
# Don't fail the entire operation if caching fails
|
||||
|
||||
return result
|
||||
|
||||
363
backend/services/cache/persistent_content_cache.py
vendored
Normal file
363
backend/services/cache/persistent_content_cache.py
vendored
Normal file
@@ -0,0 +1,363 @@
|
||||
"""
|
||||
Persistent Content Cache Service
|
||||
|
||||
Provides database-backed caching for blog content generation results to survive server restarts
|
||||
and provide better cache management across multiple instances.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import sqlite3
|
||||
from typing import Dict, Any, Optional, List
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class PersistentContentCache:
|
||||
"""Database-backed cache for blog content generation results with exact parameter matching."""
|
||||
|
||||
def __init__(self, db_path: str = "content_cache.db", max_cache_size: int = 300, cache_ttl_hours: int = 72):
|
||||
"""
|
||||
Initialize the persistent content cache.
|
||||
|
||||
Args:
|
||||
db_path: Path to SQLite database file
|
||||
max_cache_size: Maximum number of cached entries
|
||||
cache_ttl_hours: Time-to-live for cache entries in hours (longer than research cache since content is expensive)
|
||||
"""
|
||||
self.db_path = db_path
|
||||
self.max_cache_size = max_cache_size
|
||||
self.cache_ttl = timedelta(hours=cache_ttl_hours)
|
||||
|
||||
# Ensure database directory exists
|
||||
Path(db_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Initialize database
|
||||
self._init_database()
|
||||
|
||||
def _init_database(self):
|
||||
"""Initialize the SQLite database with required tables."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS content_cache (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
cache_key TEXT UNIQUE NOT NULL,
|
||||
title TEXT NOT NULL,
|
||||
sections_hash TEXT NOT NULL,
|
||||
global_target_words INTEGER NOT NULL,
|
||||
persona_data TEXT,
|
||||
tone TEXT,
|
||||
audience TEXT,
|
||||
result_data TEXT NOT NULL,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
expires_at TIMESTAMP NOT NULL,
|
||||
access_count INTEGER DEFAULT 0,
|
||||
last_accessed TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
)
|
||||
""")
|
||||
|
||||
# Create indexes for better performance
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_content_cache_key ON content_cache(cache_key)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_content_expires_at ON content_cache(expires_at)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_content_created_at ON content_cache(created_at)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_content_title ON content_cache(title)")
|
||||
|
||||
conn.commit()
|
||||
|
||||
def _generate_sections_hash(self, sections: List[Dict[str, Any]]) -> str:
|
||||
"""
|
||||
Generate a hash for sections based on their structure and content.
|
||||
|
||||
Args:
|
||||
sections: List of section dictionaries with outline information
|
||||
|
||||
Returns:
|
||||
MD5 hash of the normalized sections
|
||||
"""
|
||||
# Normalize sections for consistent hashing
|
||||
normalized_sections = []
|
||||
for section in sections:
|
||||
normalized_section = {
|
||||
'id': section.get('id', ''),
|
||||
'heading': section.get('heading', '').lower().strip(),
|
||||
'keyPoints': sorted([str(kp).lower().strip() for kp in section.get('keyPoints', [])]),
|
||||
'keywords': sorted([str(kw).lower().strip() for kw in section.get('keywords', [])]),
|
||||
'subheadings': sorted([str(sh).lower().strip() for sh in section.get('subheadings', [])]),
|
||||
'targetWords': section.get('targetWords', 0),
|
||||
# Don't include references in hash as they might vary but content should remain similar
|
||||
}
|
||||
normalized_sections.append(normalized_section)
|
||||
|
||||
# Sort sections by id for consistent ordering
|
||||
normalized_sections.sort(key=lambda x: x['id'])
|
||||
|
||||
# Generate hash
|
||||
sections_str = json.dumps(normalized_sections, sort_keys=True)
|
||||
return hashlib.md5(sections_str.encode('utf-8')).hexdigest()
|
||||
|
||||
def _generate_cache_key(self, keywords: List[str], sections: List[Dict[str, Any]],
|
||||
global_target_words: int, persona_data: Dict = None,
|
||||
tone: str = None, audience: str = None) -> str:
|
||||
"""
|
||||
Generate a cache key based on exact parameter match.
|
||||
|
||||
Args:
|
||||
keywords: Original research keywords (primary cache key)
|
||||
sections: List of section dictionaries with outline information
|
||||
global_target_words: Target word count for entire blog
|
||||
persona_data: Persona information
|
||||
tone: Content tone
|
||||
audience: Target audience
|
||||
|
||||
Returns:
|
||||
MD5 hash of the normalized parameters
|
||||
"""
|
||||
# Normalize parameters
|
||||
normalized_keywords = sorted([kw.lower().strip() for kw in (keywords or [])])
|
||||
sections_hash = self._generate_sections_hash(sections)
|
||||
normalized_tone = tone.lower().strip() if tone else "professional"
|
||||
normalized_audience = audience.lower().strip() if audience else "general"
|
||||
|
||||
# Normalize persona data
|
||||
normalized_persona = ""
|
||||
if persona_data:
|
||||
# Sort persona keys and values for consistent hashing
|
||||
persona_str = json.dumps(persona_data, sort_keys=True, default=str)
|
||||
normalized_persona = persona_str.lower()
|
||||
|
||||
# Create a consistent string representation
|
||||
cache_string = f"{normalized_keywords}|{sections_hash}|{global_target_words}|{normalized_tone}|{normalized_audience}|{normalized_persona}"
|
||||
|
||||
# Generate MD5 hash
|
||||
return hashlib.md5(cache_string.encode('utf-8')).hexdigest()
|
||||
|
||||
def _cleanup_expired_entries(self):
|
||||
"""Remove expired cache entries from database."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute(
|
||||
"DELETE FROM content_cache WHERE expires_at < ?",
|
||||
(datetime.now().isoformat(),)
|
||||
)
|
||||
deleted_count = cursor.rowcount
|
||||
if deleted_count > 0:
|
||||
logger.debug(f"Removed {deleted_count} expired content cache entries")
|
||||
conn.commit()
|
||||
|
||||
def _evict_oldest_entries(self, num_to_evict: int):
|
||||
"""Evict the oldest cache entries when cache is full."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
# Get oldest entries by creation time
|
||||
cursor = conn.execute("""
|
||||
SELECT id FROM content_cache
|
||||
ORDER BY created_at ASC
|
||||
LIMIT ?
|
||||
""", (num_to_evict,))
|
||||
|
||||
old_ids = [row[0] for row in cursor.fetchall()]
|
||||
|
||||
if old_ids:
|
||||
placeholders = ','.join(['?' for _ in old_ids])
|
||||
conn.execute(f"DELETE FROM content_cache WHERE id IN ({placeholders})", old_ids)
|
||||
logger.debug(f"Evicted {len(old_ids)} oldest content cache entries")
|
||||
|
||||
conn.commit()
|
||||
|
||||
def get_cached_content(self, keywords: List[str], sections: List[Dict[str, Any]],
|
||||
global_target_words: int, persona_data: Dict = None,
|
||||
tone: str = None, audience: str = None) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Get cached content result for exact parameter match.
|
||||
|
||||
Args:
|
||||
keywords: Original research keywords (primary cache key)
|
||||
sections: List of section dictionaries with outline information
|
||||
global_target_words: Target word count for entire blog
|
||||
persona_data: Persona information
|
||||
tone: Content tone
|
||||
audience: Target audience
|
||||
|
||||
Returns:
|
||||
Cached content result if found and valid, None otherwise
|
||||
"""
|
||||
cache_key = self._generate_cache_key(keywords, sections, global_target_words, persona_data, tone, audience)
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute("""
|
||||
SELECT result_data, expires_at FROM content_cache
|
||||
WHERE cache_key = ? AND expires_at > ?
|
||||
""", (cache_key, datetime.now().isoformat()))
|
||||
|
||||
row = cursor.fetchone()
|
||||
|
||||
if row is None:
|
||||
logger.debug(f"Content cache miss for keywords: {keywords}, sections: {len(sections)}")
|
||||
return None
|
||||
|
||||
# Update access statistics
|
||||
conn.execute("""
|
||||
UPDATE content_cache
|
||||
SET access_count = access_count + 1, last_accessed = CURRENT_TIMESTAMP
|
||||
WHERE cache_key = ?
|
||||
""", (cache_key,))
|
||||
conn.commit()
|
||||
|
||||
try:
|
||||
result_data = json.loads(row[0])
|
||||
logger.info(f"Content cache hit for keywords: {keywords} (saved expensive generation)")
|
||||
return result_data
|
||||
except json.JSONDecodeError:
|
||||
logger.error(f"Invalid JSON in content cache for keywords: {keywords}")
|
||||
# Remove invalid entry
|
||||
conn.execute("DELETE FROM content_cache WHERE cache_key = ?", (cache_key,))
|
||||
conn.commit()
|
||||
return None
|
||||
|
||||
def cache_content(self, keywords: List[str], sections: List[Dict[str, Any]],
|
||||
global_target_words: int, persona_data: Dict, tone: str,
|
||||
audience: str, result: Dict[str, Any]):
|
||||
"""
|
||||
Cache a content generation result.
|
||||
|
||||
Args:
|
||||
keywords: Original research keywords (primary cache key)
|
||||
sections: List of section dictionaries with outline information
|
||||
global_target_words: Target word count for entire blog
|
||||
persona_data: Persona information
|
||||
tone: Content tone
|
||||
audience: Target audience
|
||||
result: Content result to cache
|
||||
"""
|
||||
cache_key = self._generate_cache_key(keywords, sections, global_target_words, persona_data, tone, audience)
|
||||
sections_hash = self._generate_sections_hash(sections)
|
||||
|
||||
# Cleanup expired entries first
|
||||
self._cleanup_expired_entries()
|
||||
|
||||
# Check if cache is full and evict if necessary
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute("SELECT COUNT(*) FROM content_cache")
|
||||
current_count = cursor.fetchone()[0]
|
||||
|
||||
if current_count >= self.max_cache_size:
|
||||
num_to_evict = current_count - self.max_cache_size + 1
|
||||
self._evict_oldest_entries(num_to_evict)
|
||||
|
||||
# Store the result
|
||||
expires_at = datetime.now() + self.cache_ttl
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
conn.execute("""
|
||||
INSERT OR REPLACE INTO content_cache
|
||||
(cache_key, title, sections_hash, global_target_words, persona_data, tone, audience, result_data, expires_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", (
|
||||
cache_key,
|
||||
json.dumps(keywords), # Store keywords as JSON
|
||||
sections_hash,
|
||||
global_target_words,
|
||||
json.dumps(persona_data) if persona_data else "",
|
||||
tone or "",
|
||||
audience or "",
|
||||
json.dumps(result),
|
||||
expires_at.isoformat()
|
||||
))
|
||||
conn.commit()
|
||||
|
||||
logger.info(f"Cached content result for keywords: {keywords}, {len(sections)} sections")
|
||||
|
||||
def get_cache_stats(self) -> Dict[str, Any]:
|
||||
"""Get cache statistics."""
|
||||
self._cleanup_expired_entries()
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
# Get basic stats
|
||||
cursor = conn.execute("SELECT COUNT(*) FROM content_cache")
|
||||
total_entries = cursor.fetchone()[0]
|
||||
|
||||
cursor = conn.execute("SELECT COUNT(*) FROM content_cache WHERE expires_at > ?", (datetime.now().isoformat(),))
|
||||
valid_entries = cursor.fetchone()[0]
|
||||
|
||||
# Get most accessed entries
|
||||
cursor = conn.execute("""
|
||||
SELECT title, global_target_words, access_count, created_at
|
||||
FROM content_cache
|
||||
ORDER BY access_count DESC
|
||||
LIMIT 10
|
||||
""")
|
||||
top_entries = [
|
||||
{
|
||||
'title': row[0],
|
||||
'global_target_words': row[1],
|
||||
'access_count': row[2],
|
||||
'created_at': row[3]
|
||||
}
|
||||
for row in cursor.fetchall()
|
||||
]
|
||||
|
||||
# Get database size
|
||||
cursor = conn.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
|
||||
db_size_bytes = cursor.fetchone()[0]
|
||||
db_size_mb = db_size_bytes / (1024 * 1024)
|
||||
|
||||
return {
|
||||
'total_entries': total_entries,
|
||||
'valid_entries': valid_entries,
|
||||
'expired_entries': total_entries - valid_entries,
|
||||
'max_size': self.max_cache_size,
|
||||
'ttl_hours': self.cache_ttl.total_seconds() / 3600,
|
||||
'database_size_mb': round(db_size_mb, 2),
|
||||
'top_accessed_entries': top_entries
|
||||
}
|
||||
|
||||
def clear_cache(self):
|
||||
"""Clear all cached entries."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
conn.execute("DELETE FROM content_cache")
|
||||
conn.commit()
|
||||
logger.info("Content cache cleared")
|
||||
|
||||
def get_cache_entries(self, limit: int = 50) -> List[Dict[str, Any]]:
|
||||
"""Get recent cache entries for debugging."""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute("""
|
||||
SELECT title, global_target_words, tone, audience, created_at, expires_at, access_count
|
||||
FROM content_cache
|
||||
ORDER BY created_at DESC
|
||||
LIMIT ?
|
||||
""", (limit,))
|
||||
|
||||
return [
|
||||
{
|
||||
'title': row[0],
|
||||
'global_target_words': row[1],
|
||||
'tone': row[2],
|
||||
'audience': row[3],
|
||||
'created_at': row[4],
|
||||
'expires_at': row[5],
|
||||
'access_count': row[6]
|
||||
}
|
||||
for row in cursor.fetchall()
|
||||
]
|
||||
|
||||
def invalidate_cache_for_title(self, title: str):
|
||||
"""
|
||||
Invalidate all cache entries for specific title.
|
||||
Useful when outline is updated.
|
||||
|
||||
Args:
|
||||
title: Title to invalidate cache for
|
||||
"""
|
||||
normalized_title = title.lower().strip()
|
||||
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
cursor = conn.execute("DELETE FROM content_cache WHERE LOWER(title) = ?", (normalized_title,))
|
||||
deleted_count = cursor.rowcount
|
||||
conn.commit()
|
||||
|
||||
if deleted_count > 0:
|
||||
logger.info(f"Invalidated {deleted_count} content cache entries for title: {title}")
|
||||
|
||||
|
||||
# Global persistent cache instance
|
||||
persistent_content_cache = PersistentContentCache()
|
||||
@@ -407,11 +407,50 @@ def gemini_structured_json_response(prompt, schema, temperature=0.7, top_p=0.9,
|
||||
logger.info("No parsed content, trying to parse text response")
|
||||
try:
|
||||
import json
|
||||
parsed_text = json.loads(response.text)
|
||||
import re
|
||||
|
||||
# Clean the text response to fix common JSON issues
|
||||
cleaned_text = response.text.strip()
|
||||
|
||||
# Remove any markdown code blocks if present
|
||||
if cleaned_text.startswith('```json'):
|
||||
cleaned_text = cleaned_text[7:]
|
||||
if cleaned_text.endswith('```'):
|
||||
cleaned_text = cleaned_text[:-3]
|
||||
cleaned_text = cleaned_text.strip()
|
||||
|
||||
# Try to find JSON content between curly braces
|
||||
json_match = re.search(r'\{.*\}', cleaned_text, re.DOTALL)
|
||||
if json_match:
|
||||
cleaned_text = json_match.group(0)
|
||||
|
||||
parsed_text = json.loads(cleaned_text)
|
||||
logger.info("Successfully parsed text as JSON")
|
||||
return parsed_text
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Failed to parse text as JSON: {e}")
|
||||
logger.debug(f"Problematic text (first 500 chars): {response.text[:500]}")
|
||||
|
||||
# Try to extract and fix JSON manually
|
||||
try:
|
||||
import re
|
||||
# Look for the main JSON object
|
||||
json_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
|
||||
matches = re.findall(json_pattern, response.text, re.DOTALL)
|
||||
if matches:
|
||||
# Try the largest match (likely the main JSON)
|
||||
largest_match = max(matches, key=len)
|
||||
# Basic cleanup of common issues
|
||||
fixed_json = largest_match.replace('\n', ' ').replace('\r', ' ')
|
||||
# Remove any trailing commas before closing braces
|
||||
fixed_json = re.sub(r',\s*}', '}', fixed_json)
|
||||
fixed_json = re.sub(r',\s*]', ']', fixed_json)
|
||||
|
||||
parsed_text = json.loads(fixed_json)
|
||||
logger.info("Successfully parsed cleaned JSON")
|
||||
return parsed_text
|
||||
except Exception as fix_error:
|
||||
logger.error(f"Failed to fix JSON manually: {fix_error}")
|
||||
|
||||
# Check candidates for content (fallback for edge cases)
|
||||
if hasattr(response, 'candidates') and response.candidates:
|
||||
|
||||
Reference in New Issue
Block a user