Base code
backend/services/blog_writer/content/blog_rewriter.py (Normal file, 209 lines)
@@ -0,0 +1,209 @@
"""
Blog Rewriter Service

Handles blog rewriting based on user feedback using structured AI calls.
"""

import time
import uuid
from typing import Dict, Any
from loguru import logger

from services.llm_providers.gemini_provider import gemini_structured_json_response


class BlogRewriter:
    """Service for rewriting blog content based on user feedback."""

    def __init__(self, task_manager):
        self.task_manager = task_manager

    def start_blog_rewrite(self, request: Dict[str, Any]) -> str:
        """Start blog rewrite task with user feedback."""
        try:
            # Extract request data
            title = request.get("title", "Untitled Blog")
            sections = request.get("sections", [])
            research = request.get("research", {})
            outline = request.get("outline", [])
            feedback = request.get("feedback", "")
            tone = request.get("tone")
            audience = request.get("audience")
            focus = request.get("focus")

            if not sections:
                raise ValueError("No sections provided for rewrite")

            if not feedback or len(feedback.strip()) < 10:
                raise ValueError("Feedback is required and must be at least 10 characters")

            # Create task for rewrite
            task_id = f"rewrite_{int(time.time())}_{uuid.uuid4().hex[:8]}"

            # Start the rewrite task
            self.task_manager.start_task(
                task_id,
                self._execute_blog_rewrite,
                title=title,
                sections=sections,
                research=research,
                outline=outline,
                feedback=feedback,
                tone=tone,
                audience=audience,
                focus=focus
            )

            logger.info(f"Blog rewrite task started: {task_id}")
            return task_id

        except Exception as e:
            logger.error(f"Failed to start blog rewrite: {e}")
            raise

    async def _execute_blog_rewrite(self, task_id: str, **kwargs):
        """Execute the blog rewrite task."""
        try:
            title = kwargs.get("title", "Untitled Blog")
            sections = kwargs.get("sections", [])
            research = kwargs.get("research", {})
            outline = kwargs.get("outline", [])
            feedback = kwargs.get("feedback", "")
            tone = kwargs.get("tone")
            audience = kwargs.get("audience")
            focus = kwargs.get("focus")

            # Update task status
            self.task_manager.update_task_status(task_id, "processing", "Analyzing current content and feedback...")

            # Build rewrite prompt with user feedback
            system_prompt = f"""You are an expert blog writer tasked with rewriting content based on user feedback.

Current Blog Title: {title}
User Feedback: {feedback}
{f"Desired Tone: {tone}" if tone else ""}
{f"Target Audience: {audience}" if audience else ""}
{f"Focus Area: {focus}" if focus else ""}

Your task is to rewrite the blog content to address the user's feedback while maintaining the core structure and research insights."""

            # Prepare content for rewrite
            full_content = f"Title: {title}\n\n"
            for section in sections:
                full_content += f"Section: {section.get('heading', 'Untitled')}\n"
                full_content += f"Content: {section.get('content', '')}\n\n"

            # Create rewrite prompt
            rewrite_prompt = f"""
Based on the user feedback and current blog content, rewrite the blog to address their concerns and preferences.

Current Content:
{full_content}

User Feedback: {feedback}
{f"Desired Tone: {tone}" if tone else ""}
{f"Target Audience: {audience}" if audience else ""}
{f"Focus Area: {focus}" if focus else ""}

Please rewrite the blog content in the following JSON format:
{{
    "title": "New or improved blog title",
    "sections": [
        {{
            "id": "section_id",
            "heading": "Section heading",
            "content": "Rewritten section content"
        }}
    ]
}}

Guidelines:
1. Address the user's feedback directly
2. Maintain the research insights and factual accuracy
3. Improve flow, clarity, and engagement
4. Keep the same section structure unless feedback suggests otherwise
5. Ensure content is well-formatted with proper paragraphs
"""

            # Update task status
            self.task_manager.update_task_status(task_id, "processing", "Generating rewritten content...")

            # Use structured JSON generation
            schema = {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "sections": {
                        "type": "array",
                        "items": {
                            "type": "object",
                            "properties": {
                                "id": {"type": "string"},
                                "heading": {"type": "string"},
                                "content": {"type": "string"}
                            }
                        }
                    }
                }
            }

            result = gemini_structured_json_response(
                prompt=rewrite_prompt,
                schema=schema,
                temperature=0.7,
                max_tokens=4096,
                system_prompt=system_prompt
            )

            logger.info(f"Gemini response for rewrite task {task_id}: {result}")

            # Check if we have a valid result - handle both multi-section and single-section formats
            is_valid_multi_section = result and not result.get("error") and result.get("title") and result.get("sections")
            is_valid_single_section = result and not result.get("error") and (result.get("heading") or result.get("title")) and result.get("content")

            if is_valid_multi_section or is_valid_single_section:
                # If single section format, convert to multi-section format for consistency
                if is_valid_single_section and not is_valid_multi_section:
                    # Convert single section to multi-section format
                    converted_result = {
                        "title": result.get("heading") or result.get("title") or "Rewritten Blog",
                        "sections": [
                            {
                                "id": result.get("id") or "section_1",
                                "heading": result.get("heading") or "Main Content",
                                "content": result.get("content", "")
                            }
                        ]
                    }
                    result = converted_result
                    logger.info(f"Converted single section response to multi-section format for task {task_id}")

                # Update task status with success
                self.task_manager.update_task_status(
                    task_id,
                    "completed",
                    "Blog rewrite completed successfully!",
                    result=result
                )
                logger.info(f"Blog rewrite completed successfully: {task_id}")
            else:
                # More detailed error handling
                if not result:
                    error_msg = "No response from AI"
                elif result.get("error"):
                    error_msg = f"AI error: {result.get('error')}"
                elif not (result.get("title") or result.get("heading")):
                    error_msg = "AI response missing title/heading"
                elif not (result.get("sections") or result.get("content")):
                    error_msg = "AI response missing sections/content"
                else:
                    error_msg = "AI response has invalid structure"

                self.task_manager.update_task_status(task_id, "failed", f"Rewrite failed: {error_msg}")
                logger.error(f"Blog rewrite failed: {error_msg}")

        except Exception as e:
            error_msg = f"Blog rewrite error: {str(e)}"
            self.task_manager.update_task_status(task_id, "failed", error_msg)
            logger.error(f"Blog rewrite task failed: {e}")
            raise
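A minimal usage sketch for BlogRewriter. The InMemoryTaskManager stub below is hypothetical (not part of this commit); it only mimics the two hooks the service calls, start_task and update_task_status.

# Illustrative only; InMemoryTaskManager is a hypothetical stand-in.
from services.blog_writer.content.blog_rewriter import BlogRewriter


class InMemoryTaskManager:
    """Minimal stub exposing the two hooks BlogRewriter relies on."""

    def start_task(self, task_id, task_fn, **kwargs):
        print(f"queued {task_id} -> {task_fn.__name__} with {sorted(kwargs)}")

    def update_task_status(self, task_id, status, message, result=None):
        print(f"{task_id}: {status} - {message}")


rewriter = BlogRewriter(task_manager=InMemoryTaskManager())
task_id = rewriter.start_blog_rewrite({
    "title": "Getting Started with RAG",
    "sections": [{"id": "s1", "heading": "Overview", "content": "..."}],
    "feedback": "Make the introduction more practical and add a concrete example.",
    "tone": "conversational",
})
print(task_id)  # e.g. rewrite_1712345678_a1b2c3d4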
backend/services/blog_writer/content/context_memory.py (Normal file, 152 lines)
@@ -0,0 +1,152 @@
"""
ContextMemory - maintains intelligent continuity context across sections using LLM-enhanced summarization.

Stores smart per-section summaries and thread keywords for use in prompts with cost optimization.
"""

from __future__ import annotations

from typing import Dict, List, Optional, Tuple
from collections import deque
from loguru import logger
import hashlib

# Import the common gemini provider
from services.llm_providers.gemini_provider import gemini_text_response


class ContextMemory:
    """In-memory continuity store for recent sections with LLM-enhanced summarization.

    Notes:
    - Keeps an ordered deque of recent (section_id, summary) pairs
    - Uses LLM for intelligent summarization when content is substantial
    - Provides utilities to build a compact previous-sections summary
    - Implements caching to minimize LLM calls
    """

    def __init__(self, max_entries: int = 10):
        self.max_entries = max_entries
        self._recent: deque[Tuple[str, str]] = deque(maxlen=max_entries)
        # Cache for LLM-generated summaries
        self._summary_cache: Dict[str, str] = {}
        logger.info("✅ ContextMemory initialized with LLM-enhanced summarization")

    def update_with_section(self, section_id: str, full_text: str, use_llm: bool = True) -> None:
        """Create a compact summary and store it for continuity usage."""
        summary = self._summarize_text_intelligently(full_text, use_llm=use_llm)
        self._recent.append((section_id, summary))

    def get_recent_summaries(self, limit: int = 2) -> List[str]:
        """Return the last N stored summaries (most recent first)."""
        return [s for (_sid, s) in list(self._recent)[-limit:]]

    def build_previous_sections_summary(self, limit: int = 2) -> str:
        """Join recent summaries for prompt injection."""
        recents = self.get_recent_summaries(limit=limit)
        if not recents:
            return ""
        return "\n\n".join(recents)

    def _summarize_text_intelligently(self, text: str, target_words: int = 80, use_llm: bool = True) -> str:
        """Create intelligent summary using LLM when appropriate, fallback to truncation."""

        # Create cache key
        cache_key = self._get_cache_key(text)

        # Check cache first
        if cache_key in self._summary_cache:
            logger.debug("Summary cache hit")
            return self._summary_cache[cache_key]

        # Determine if we should use LLM
        should_use_llm = use_llm and self._should_use_llm_summarization(text)

        if should_use_llm:
            try:
                summary = self._llm_summarize_text(text, target_words)
                self._summary_cache[cache_key] = summary
                logger.info("LLM-based summarization completed")
                return summary
            except Exception as e:
                logger.warning(f"LLM summarization failed, using fallback: {e}")
                # Fall through to local summarization

        # Local fallback
        summary = self._summarize_text_locally(text, target_words)
        self._summary_cache[cache_key] = summary
        return summary

    def _should_use_llm_summarization(self, text: str) -> bool:
        """Determine if content is substantial enough to warrant LLM summarization."""
        word_count = len(text.split())
        # Use LLM for substantial content (>150 words) or complex structure
        has_complex_structure = any(marker in text for marker in ['##', '###', '**', '*', '-', '1.', '2.'])

        return word_count > 150 or has_complex_structure

    def _llm_summarize_text(self, text: str, target_words: int = 80) -> str:
        """Use Gemini API for intelligent text summarization."""

        # Truncate text to minimize tokens while keeping key content
        truncated_text = text[:800]  # First 800 chars usually contain the main points

        prompt = f"""
Summarize the following content in approximately {target_words} words, focusing on key concepts and main points.

Content: {truncated_text}

Requirements:
- Capture the main ideas and key concepts
- Maintain the original tone and style
- Keep it concise but informative
- Focus on what's most important for continuity

Generate only the summary, no explanations or formatting.
"""

        try:
            result = gemini_text_response(
                prompt=prompt,
                temperature=0.3,  # Low temperature for consistent summarization
                max_tokens=500,  # Increased tokens for better summaries
                system_prompt="You are an expert at creating concise, informative summaries."
            )

            if result and result.strip():
                summary = result.strip()
                # Ensure it's not too long
                words = summary.split()
                if len(words) > target_words + 20:  # Allow some flexibility
                    summary = " ".join(words[:target_words]) + "..."
                return summary
            else:
                logger.warning("LLM summary response empty, using fallback")
                return self._summarize_text_locally(text, target_words)

        except Exception as e:
            logger.error(f"LLM summarization error: {e}")
            return self._summarize_text_locally(text, target_words)

    def _summarize_text_locally(self, text: str, target_words: int = 80) -> str:
        """Very lightweight, deterministic truncation-based summary.

        This deliberately avoids extra LLM calls. It collects the first
        sentences up to approximately target_words.
        """
        words = text.split()
        if len(words) <= target_words:
            return text.strip()
        return " ".join(words[:target_words]).strip() + " …"

    def _get_cache_key(self, text: str) -> str:
        """Generate cache key from text hash."""
        # Use first 200 chars for cache key to balance uniqueness vs memory
        return hashlib.md5(text[:200].encode()).hexdigest()[:12]

    def clear_cache(self):
        """Clear summary cache (useful for testing or memory management)."""
        self._summary_cache.clear()
        logger.info("ContextMemory cache cleared")
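A short usage sketch for ContextMemory; use_llm=False keeps it on the deterministic truncation path so no Gemini key is needed. The section text is made up.

from services.blog_writer.content.context_memory import ContextMemory

memory = ContextMemory(max_entries=5)
# use_llm=False skips _llm_summarize_text and uses the local truncation fallback
memory.update_with_section("intro", "Retrieval-augmented generation pairs a retriever with a generator so answers stay grounded in source material.", use_llm=False)
memory.update_with_section("setup", "Start by embedding the corpus, loading it into a vector store, and wiring the retriever into the prompt.", use_llm=False)

# Compact context block to inject into the next section's prompt
print(memory.build_previous_sections_summary(limit=2))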
@@ -0,0 +1,92 @@
"""
EnhancedContentGenerator - thin orchestrator for section generation.

Provider parity:
- Uses main_text_generation.llm_text_gen to respect GPT_PROVIDER (Gemini/HF)
- No direct provider coupling here; Google grounding remains in research only
"""

from typing import Any, Dict

from services.llm_providers.main_text_generation import llm_text_gen
from .source_url_manager import SourceURLManager
from .context_memory import ContextMemory
from .transition_generator import TransitionGenerator
from .flow_analyzer import FlowAnalyzer


class EnhancedContentGenerator:
    def __init__(self):
        self.url_manager = SourceURLManager()
        self.memory = ContextMemory(max_entries=12)
        self.transitioner = TransitionGenerator()
        self.flow = FlowAnalyzer()

    async def generate_section(self, section: Any, research: Any, mode: str = "polished") -> Dict[str, Any]:
        """Generate one section with sources, a transition, and continuity metrics."""
        prev_summary = self.memory.build_previous_sections_summary(limit=2)
        urls = self.url_manager.pick_relevant_urls(section, research)
        prompt = self._build_prompt(section, research, prev_summary, urls)
        # Provider-agnostic text generation (respect GPT_PROVIDER & circuit-breaker)
        content_text: str = ""
        try:
            ai_resp = llm_text_gen(
                prompt=prompt,
                json_struct=None,
                system_prompt=None,
            )
            if isinstance(ai_resp, dict) and ai_resp.get("text"):
                content_text = ai_resp.get("text", "")
            elif isinstance(ai_resp, str):
                content_text = ai_resp
            else:
                # Fallback best-effort extraction
                content_text = str(ai_resp or "")
        except Exception:
            content_text = ""

        result = {
            "content": content_text,
            # url_manager returns plain URL strings; tolerate dict entries as well
            "sources": [
                {"title": u.get("title", ""), "url": u.get("url", "")} if isinstance(u, dict) else {"title": "", "url": u}
                for u in urls
            ] if urls else [],
        }
        # Generate transition and compute intelligent flow metrics
        previous_text = prev_summary
        current_text = result.get("content", "")
        transition = self.transitioner.generate_transition(previous_text, getattr(section, 'heading', 'This section'), use_llm=True)
        metrics = self.flow.assess_flow(previous_text, current_text, use_llm=True)

        # Update memory for subsequent sections and store continuity snapshot
        if current_text:
            self.memory.update_with_section(getattr(section, 'id', 'unknown'), current_text, use_llm=True)

        # Return enriched result
        result["transition"] = transition
        result["continuity_metrics"] = metrics
        # Persist a lightweight continuity snapshot for API access
        try:
            sid = getattr(section, 'id', 'unknown')
            if not hasattr(self, "_last_continuity"):
                self._last_continuity = {}
            self._last_continuity[sid] = metrics
        except Exception:
            pass
        return result

    def _build_prompt(self, section: Any, research: Any, prev_summary: str, urls: list) -> str:
        """Compose the section-writing prompt from outline data, prior context, and picked sources."""
        heading = getattr(section, 'heading', 'Section')
        key_points = getattr(section, 'key_points', [])
        keywords = getattr(section, 'keywords', [])
        target_words = getattr(section, 'target_words', 300)
        # Sources may arrive as URL strings or as {title, url} dicts
        url_block = "\n".join(
            [f"- {u.get('title', '')} ({u.get('url', '')})" if isinstance(u, dict) else f"- {u}" for u in urls]
        ) if urls else "(no specific URLs provided)"

        return (
            f"You are writing the blog section '{heading}'.\n\n"
            f"Context summary (previous sections): {prev_summary}\n\n"
            f"Authoring requirements:\n"
            f"- Target word count: ~{target_words}\n"
            f"- Use the following key points: {', '.join(key_points)}\n"
            f"- Include these keywords naturally: {', '.join(keywords)}\n"
            f"- Cite insights from these sources when relevant (do not output raw URLs):\n{url_block}\n\n"
            "Write engaging, well-structured markdown with clear paragraphs (2-4 sentences each) separated by double line breaks."
        )
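A hedged sketch of calling EnhancedContentGenerator.generate_section. The module path is assumed from the package imports above, and the SimpleNamespace objects stand in for the real outline section and research models.

import asyncio
from types import SimpleNamespace

# Module path assumed; adjust to wherever this file lives in the package.
from services.blog_writer.content.enhanced_content_generator import EnhancedContentGenerator

section = SimpleNamespace(
    id="s1",
    heading="Why Context Windows Matter",
    key_points=["token limits", "retrieval trade-offs"],
    keywords=["context window", "LLM"],
    target_words=300,
)
research = SimpleNamespace(sources=[
    {"title": "Context windows explained", "url": "https://example.com/context-windows"},
])

generator = EnhancedContentGenerator()
result = asyncio.run(generator.generate_section(section, research))
print(result["transition"])
print(result["continuity_metrics"])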
backend/services/blog_writer/content/flow_analyzer.py (Normal file, 162 lines)
@@ -0,0 +1,162 @@
"""
FlowAnalyzer - evaluates narrative flow using LLM-based analysis with cost optimization.

Uses Gemini API for intelligent analysis while minimizing API calls through caching and smart triggers.
"""

from typing import Dict, Optional
from loguru import logger
import hashlib
import json

# Import the common gemini provider
from services.llm_providers.gemini_provider import gemini_structured_json_response


class FlowAnalyzer:
    def __init__(self):
        # Simple in-memory cache to avoid redundant LLM calls
        self._cache: Dict[str, Dict[str, float]] = {}
        # Cache for rule-based fallback when LLM analysis isn't needed
        self._rule_cache: Dict[str, Dict[str, float]] = {}
        logger.info("✅ FlowAnalyzer initialized with LLM-based analysis")

    def assess_flow(self, previous_text: str, current_text: str, use_llm: bool = True) -> Dict[str, float]:
        """
        Return flow metrics in range 0..1.

        Args:
            previous_text: Previous section content
            current_text: Current section content
            use_llm: Whether to use LLM analysis (default: True for significant content)
        """
        if not current_text:
            return {"flow": 0.0, "consistency": 0.0, "progression": 0.0}

        # Create cache key from content hashes
        cache_key = self._get_cache_key(previous_text, current_text)

        # Check cache first
        if cache_key in self._cache:
            logger.debug("Flow analysis cache hit")
            return self._cache[cache_key]

        # Determine if we should use LLM analysis
        should_use_llm = use_llm and self._should_use_llm_analysis(previous_text, current_text)

        if should_use_llm:
            try:
                metrics = self._llm_flow_analysis(previous_text, current_text)
                self._cache[cache_key] = metrics
                logger.info("LLM-based flow analysis completed")
                return metrics
            except Exception as e:
                logger.warning(f"LLM flow analysis failed, falling back to rules: {e}")
                # Fall through to rule-based analysis

        # Rule-based fallback (cached separately)
        if cache_key in self._rule_cache:
            return self._rule_cache[cache_key]

        metrics = self._rule_based_analysis(previous_text, current_text)
        self._rule_cache[cache_key] = metrics
        return metrics

    def _should_use_llm_analysis(self, previous_text: str, current_text: str) -> bool:
        """Determine if content is significant enough to warrant LLM analysis."""
        # Use LLM for substantial content or when previous context exists
        word_count = len(current_text.split())
        has_previous = bool(previous_text and len(previous_text.strip()) > 50)

        # Use LLM if: substantial content (>100 words) OR has meaningful previous context
        return word_count > 100 or has_previous

    def _llm_flow_analysis(self, previous_text: str, current_text: str) -> Dict[str, float]:
        """Use Gemini API for intelligent flow analysis."""

        # Truncate content to minimize tokens while keeping context
        prev_truncated = previous_text[-300:] if previous_text else ""
        curr_truncated = current_text[:500]  # First 500 chars usually contain the key content

        prompt = f"""
Analyze the narrative flow between these two content sections. Rate each aspect from 0.0 to 1.0.

PREVIOUS SECTION (end): {prev_truncated}
CURRENT SECTION (start): {curr_truncated}

Evaluate:
1. Flow Quality (0.0-1.0): How smoothly does the content transition? Are there logical connections?
2. Consistency (0.0-1.0): Do key themes, terminology, and tone remain consistent?
3. Progression (0.0-1.0): Does the content logically build upon previous ideas?

Return ONLY a JSON object with these exact keys: flow, consistency, progression
"""

        schema = {
            "type": "object",
            "properties": {
                "flow": {"type": "number", "minimum": 0.0, "maximum": 1.0},
                "consistency": {"type": "number", "minimum": 0.0, "maximum": 1.0},
                "progression": {"type": "number", "minimum": 0.0, "maximum": 1.0}
            },
            "required": ["flow", "consistency", "progression"]
        }

        try:
            result = gemini_structured_json_response(
                prompt=prompt,
                schema=schema,
                temperature=0.2,  # Low temperature for consistent scoring
                max_tokens=1000  # Increased tokens for better analysis
            )

            # Result is expected to be a parsed dict (as in BlogRewriter above)
            if result and isinstance(result, dict) and not result.get("error"):
                return {
                    "flow": float(result.get("flow", 0.6)),
                    "consistency": float(result.get("consistency", 0.6)),
                    "progression": float(result.get("progression", 0.6))
                }
            else:
                logger.warning("LLM response parsing failed, using fallback")
                return self._rule_based_analysis(previous_text, current_text)

        except Exception as e:
            logger.error(f"LLM flow analysis error: {e}")
            return self._rule_based_analysis(previous_text, current_text)

    def _rule_based_analysis(self, previous_text: str, current_text: str) -> Dict[str, float]:
        """Fallback rule-based analysis for cost efficiency."""
        flow = 0.6
        consistency = 0.6
        progression = 0.6

        # Enhanced heuristics
        if previous_text and previous_text[-1] in ".!?":
            flow += 0.1
        if any(k in current_text.lower() for k in ["therefore", "next", "building on", "as a result", "furthermore", "additionally"]):
            progression += 0.2
        if len(current_text.split()) > 120:
            consistency += 0.1
        if any(k in current_text.lower() for k in ["however", "but", "although", "despite"]):
            flow += 0.1  # Good use of contrast words

        return {
            "flow": min(flow, 1.0),
            "consistency": min(consistency, 1.0),
            "progression": min(progression, 1.0),
        }

    def _get_cache_key(self, previous_text: str, current_text: str) -> str:
        """Generate cache key from content hashes."""
        # Use first 100 chars of each for cache key to balance uniqueness vs memory
        prev_hash = hashlib.md5((previous_text[:100] if previous_text else "").encode()).hexdigest()[:8]
        curr_hash = hashlib.md5(current_text[:100].encode()).hexdigest()[:8]
        return f"{prev_hash}_{curr_hash}"

    def clear_cache(self):
        """Clear analysis cache (useful for testing or memory management)."""
        self._cache.clear()
        self._rule_cache.clear()
        logger.info("FlowAnalyzer cache cleared")
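A quick sketch exercising FlowAnalyzer's rule-based fallback (use_llm=False), which needs no API access; the sample text is invented.

from services.blog_writer.content.flow_analyzer import FlowAnalyzer

analyzer = FlowAnalyzer()
previous = "We covered the main challenges of long-form generation."
current = "Therefore, the next step is to measure how well consecutive sections hang together. " * 10
print(analyzer.assess_flow(previous, current, use_llm=False))
# Roughly {'flow': 0.7, 'consistency': 0.7, 'progression': 0.8} under these heuristics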
backend/services/blog_writer/content/introduction_generator.py (Normal file, 186 lines)
@@ -0,0 +1,186 @@
"""
Introduction Generator - Generates varied blog introductions based on content and research.

Generates 3 different introduction options for the user to choose from.
"""

from typing import Dict, Any, List
from loguru import logger

from models.blog_models import BlogResearchResponse, BlogOutlineSection


class IntroductionGenerator:
    """Generates blog introductions using research and content data."""

    def __init__(self):
        """Initialize the introduction generator."""
        pass

    def build_introduction_prompt(
        self,
        blog_title: str,
        research: BlogResearchResponse,
        outline: List[BlogOutlineSection],
        sections_content: Dict[str, str],
        primary_keywords: List[str],
        search_intent: str
    ) -> str:
        """Build a prompt for generating blog introductions."""

        # Extract key research insights
        keyword_analysis = research.keyword_analysis or {}
        content_angles = research.suggested_angles or []

        # Get a summary of the first few sections for context
        section_summaries = []
        for i, section in enumerate(outline[:3], 1):
            section_id = section.id
            content = sections_content.get(section_id, '')
            if content:
                # Take first 200 chars as summary
                summary = content[:200] + '...' if len(content) > 200 else content
                section_summaries.append(f"{i}. {section.heading}: {summary}")

        sections_text = '\n'.join(section_summaries) if section_summaries else "Content sections are being generated."

        primary_kw_text = ', '.join(primary_keywords) if primary_keywords else "the topic"
        content_angle_text = ', '.join(content_angles[:3]) if content_angles else "General insights"

        return f"""Generate exactly 3 varied blog introductions for the following blog post.

BLOG TITLE: {blog_title}

PRIMARY KEYWORDS: {primary_kw_text}
SEARCH INTENT: {search_intent}
CONTENT ANGLES: {content_angle_text}

BLOG CONTENT SUMMARY:
{sections_text}

REQUIREMENTS FOR EACH INTRODUCTION:
- 80-120 words in length
- Hook the reader immediately with a compelling opening
- Clearly state the value proposition and what readers will learn
- Include the primary keyword naturally within the first 2 sentences
- Each introduction should have a different angle/approach:
  1. First: Problem-focused (highlight the challenge readers face)
  2. Second: Benefit-focused (emphasize the value and outcomes)
  3. Third: Story/statistic-focused (use a compelling fact or narrative hook)
- Maintain a professional yet engaging tone
- Avoid generic phrases - be specific and benefit-driven

Return ONLY a JSON array of exactly 3 introductions:
[
    "First introduction (80-120 words, problem-focused)",
    "Second introduction (80-120 words, benefit-focused)",
    "Third introduction (80-120 words, story/statistic-focused)"
]"""

    def get_introduction_schema(self) -> Dict[str, Any]:
        """Get the JSON schema for introduction generation."""
        return {
            "type": "array",
            "items": {
                "type": "string",
                # Character bounds sized for the 80-120 word target (roughly 450-800 characters)
                "minLength": 80,
                "maxLength": 900
            },
            "minItems": 3,
            "maxItems": 3
        }

    async def generate_introductions(
        self,
        blog_title: str,
        research: BlogResearchResponse,
        outline: List[BlogOutlineSection],
        sections_content: Dict[str, str],
        primary_keywords: List[str],
        search_intent: str,
        user_id: str
    ) -> List[str]:
        """Generate 3 varied blog introductions.

        Args:
            blog_title: The blog post title
            research: Research data with keywords and insights
            outline: Blog outline sections
            sections_content: Dictionary mapping section IDs to their content
            primary_keywords: Primary keywords for the blog
            search_intent: Search intent (informational, commercial, etc.)
            user_id: User ID for API calls

        Returns:
            List of 3 introduction options
        """
        from services.llm_providers.main_text_generation import llm_text_gen

        if not user_id:
            raise ValueError("user_id is required for introduction generation")

        # Build prompt
        prompt = self.build_introduction_prompt(
            blog_title=blog_title,
            research=research,
            outline=outline,
            sections_content=sections_content,
            primary_keywords=primary_keywords,
            search_intent=search_intent
        )

        # Get schema
        schema = self.get_introduction_schema()

        logger.info(f"Generating blog introductions for user {user_id}")

        try:
            # Generate introductions using structured JSON response
            result = llm_text_gen(
                prompt=prompt,
                json_struct=schema,
                system_prompt="You are an expert content writer specializing in creating compelling blog introductions that hook readers and clearly communicate value.",
                user_id=user_id
            )

            # Handle response - could be array directly or wrapped in dict
            if isinstance(result, list):
                introductions = result
            elif isinstance(result, dict):
                # Try common keys
                introductions = result.get('introductions', result.get('options', result.get('intros', [])))
                if not introductions and isinstance(result.get('response'), list):
                    introductions = result['response']
            else:
                logger.warning(f"Unexpected introduction generation result type: {type(result)}")
                introductions = []

            # Validate and clean introductions
            cleaned_introductions = []
            for intro in introductions:
                if isinstance(intro, str) and len(intro.strip()) >= 50:  # Minimum reasonable length
                    cleaned = intro.strip()
                    # Ensure it's within reasonable bounds (word-based, matching the 80-120 word target)
                    if len(cleaned.split()) <= 150:  # Allow slight overflow for quality
                        cleaned_introductions.append(cleaned)

            # Ensure we have exactly 3 introductions
            if len(cleaned_introductions) < 3:
                logger.warning(f"Generated only {len(cleaned_introductions)} introductions, expected 3")
                # Pad with placeholder if needed
                while len(cleaned_introductions) < 3:
                    cleaned_introductions.append(f"{blog_title} - A comprehensive guide covering essential insights and practical strategies.")

            # Return exactly 3 introductions
            return cleaned_introductions[:3]

        except Exception as e:
            logger.error(f"Failed to generate introductions: {e}")
            # Fallback: generate simple introductions
            fallback_introductions = [
                f"In this comprehensive guide, we'll explore {primary_keywords[0] if primary_keywords else 'essential insights'} and provide actionable strategies.",
                f"Discover everything you need to know about {primary_keywords[0] if primary_keywords else 'this topic'} and how it can transform your approach.",
                f"Whether you're new to {primary_keywords[0] if primary_keywords else 'this topic'} or looking to deepen your understanding, this guide has you covered."
            ]
            return fallback_introductions
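A prompt-only sketch for IntroductionGenerator using SimpleNamespace stand-ins for the research and outline models, so it runs without an LLM call.

from types import SimpleNamespace
from services.blog_writer.content.introduction_generator import IntroductionGenerator

gen = IntroductionGenerator()
research = SimpleNamespace(keyword_analysis={}, suggested_angles=["beginner guide", "cost comparison"])
outline = [SimpleNamespace(id="s1", heading="What Is Edge Caching?")]

prompt = gen.build_introduction_prompt(
    blog_title="Edge Caching for Busy Developers",
    research=research,
    outline=outline,
    sections_content={"s1": "Edge caching keeps responses physically close to users, cutting latency."},
    primary_keywords=["edge caching"],
    search_intent="informational",
)
print(prompt[:400])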
backend/services/blog_writer/content/medium_blog_generator.py (Normal file, 257 lines)
@@ -0,0 +1,257 @@
"""
Medium Blog Generator Service

Handles generation of medium-length blogs (≤1000 words) using structured AI calls.
"""

import time
import json
from typing import Dict, Any, List
from loguru import logger
from fastapi import HTTPException

from models.blog_models import (
    MediumBlogGenerateRequest,
    MediumBlogGenerateResult,
    MediumGeneratedSection,
    ResearchSource,
)
from services.llm_providers.main_text_generation import llm_text_gen
from services.cache.persistent_content_cache import persistent_content_cache


class MediumBlogGenerator:
    """Service for generating medium-length blog content using structured AI calls."""

    def __init__(self):
        self.cache = persistent_content_cache

    async def generate_medium_blog_with_progress(self, req: MediumBlogGenerateRequest, task_id: str, user_id: str) -> MediumBlogGenerateResult:
        """Use Gemini structured JSON to generate a medium-length blog in one call.

        Args:
            req: Medium blog generation request
            task_id: Task ID for progress updates
            user_id: User ID (required for subscription checks and usage tracking)

        Raises:
            ValueError: If user_id is not provided
        """
        if not user_id:
            raise ValueError("user_id is required for medium blog generation (subscription checks and usage tracking)")

        start = time.time()

        # Prepare sections data for cache key generation
        sections_for_cache = []
        for s in req.sections:
            sections_for_cache.append({
                "id": s.id,
                "heading": s.heading,
                "keyPoints": getattr(s, "key_points", []) or getattr(s, "keyPoints", []),
                "subheadings": getattr(s, "subheadings", []),
                "keywords": getattr(s, "keywords", []),
                "targetWords": getattr(s, "target_words", None) or getattr(s, "targetWords", None),
            })

        # Check cache first
        cached_result = self.cache.get_cached_content(
            keywords=req.researchKeywords or [],
            sections=sections_for_cache,
            global_target_words=req.globalTargetWords or 1000,
            persona_data=req.persona.dict() if req.persona else None,
            tone=req.tone,
            audience=req.audience
        )

        if cached_result:
            logger.info(f"Using cached content for keywords: {req.researchKeywords} (saved expensive generation)")
            # Add cache hit marker to distinguish from fresh generation
            cached_result['generation_time_ms'] = 0  # Mark as cache hit
            cached_result['cache_hit'] = True
            return MediumBlogGenerateResult(**cached_result)

        # Cache miss - proceed with AI generation
        logger.info(f"Cache miss - generating new content for keywords: {req.researchKeywords}")

        # Build schema expected from the model
        schema = {
            "type": "object",
            "properties": {
                "title": {"type": "string"},
                "sections": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "id": {"type": "string"},
                            "heading": {"type": "string"},
                            "content": {"type": "string"},
                            "wordCount": {"type": "number"},
                            "sources": {
                                "type": "array",
                                "items": {
                                    "type": "object",
                                    "properties": {"title": {"type": "string"}, "url": {"type": "string"}},
                                },
                            },
                        },
                    },
                },
            },
        }

        # Compose prompt
        def section_block(s):
            return {
                "id": s.id,
                "heading": s.heading,
                "outline": {
                    "keyPoints": getattr(s, "key_points", []) or getattr(s, "keyPoints", []),
                    "subheadings": getattr(s, "subheadings", []),
                    "keywords": getattr(s, "keywords", []),
                    "targetWords": getattr(s, "target_words", None) or getattr(s, "targetWords", None),
                    "references": [
                        {"title": r.title, "url": r.url} for r in getattr(s, "references", [])
                    ],
                },
            }

        payload = {
            "title": req.title,
            "globalTargetWords": req.globalTargetWords or 1000,
            "persona": req.persona.dict() if req.persona else None,
            "tone": req.tone,
            "audience": req.audience,
            "sections": [section_block(s) for s in req.sections],
        }

        # Build persona-aware system prompt
        persona_context = ""
        if req.persona:
            persona_context = f"""
PERSONA GUIDELINES:
- Industry: {req.persona.industry or 'General'}
- Tone: {req.persona.tone or 'Professional'}
- Audience: {req.persona.audience or 'General readers'}
- Persona ID: {req.persona.persona_id or 'Default'}

Write content that reflects this persona's expertise and communication style.
Use industry-specific terminology and examples where appropriate.
Maintain consistent voice and authority throughout all sections.
"""

        system = (
            "You are a professional blog writer with deep expertise in your field. "
            "Generate high-quality, persona-driven content for each section based on the provided outline. "
            "Write engaging, informative content that follows the section's key points and target word count. "
            "Ensure the content flows naturally and maintains consistent voice and authority. "
            "Format content with proper paragraph breaks using double line breaks (\\n\\n) between paragraphs. "
            "Structure content with clear paragraphs - aim for 2-4 sentences per paragraph. "
            f"{persona_context}"
            "Return ONLY valid JSON with no markdown formatting or explanations."
        )

        # Build persona-specific content instructions
        persona_instructions = ""
        if req.persona:
            industry = req.persona.industry or 'General'
            tone = req.persona.tone or 'Professional'
            audience = req.persona.audience or 'General readers'

            persona_instructions = f"""
PERSONA-DRIVEN CONTENT REQUIREMENTS:
- Write as an expert in {industry} industry
- Use {tone} tone appropriate for {audience}
- Include industry-specific examples and terminology
- Demonstrate authority and expertise in the field
- Use language that resonates with {audience}
- Maintain consistent voice that reflects this persona's expertise
"""

        prompt = (
            f"Write blog content for the following sections. The blog should total about {req.globalTargetWords or 1000} words, distributed across all sections.\n\n"
            f"Blog Title: {req.title}\n\n"
            "For each section, write engaging content that:\n"
            "- Follows the key points provided\n"
            "- Uses the suggested keywords naturally\n"
            "- Meets the target word count\n"
            "- Maintains professional tone\n"
            "- References the provided sources when relevant\n"
            "- Breaks content into clear paragraphs (2-4 sentences each)\n"
            "- Uses double line breaks (\\n\\n) between paragraphs for proper formatting\n"
            "- Starts with an engaging opening paragraph\n"
            "- Ends with a strong concluding paragraph\n"
            f"{persona_instructions}\n"
            "IMPORTANT: Format the 'content' field with proper paragraph breaks using \\n\\n between paragraphs.\n\n"
            "Return a JSON object with 'title' and 'sections' array. Each section should have 'id', 'heading', 'content', and 'wordCount'.\n\n"
            f"Sections to write:\n{json.dumps(payload, ensure_ascii=False, indent=2)}"
        )

        try:
            ai_resp = llm_text_gen(
                prompt=prompt,
                json_struct=schema,
                system_prompt=system,
                user_id=user_id
            )
        except HTTPException:
            # Re-raise HTTPExceptions (e.g., 429 subscription limit) to preserve error details
            raise
        except Exception as llm_error:
            # Wrap other errors
            logger.error(f"AI generation failed: {llm_error}")
            raise Exception(f"AI generation failed: {str(llm_error)}")

        # Check for errors in AI response
        if not ai_resp or ai_resp.get("error"):
            error_msg = ai_resp.get("error", "Empty generation result from model") if ai_resp else "No response from model"
            logger.error(f"AI generation failed: {error_msg}")
            raise Exception(f"AI generation failed: {error_msg}")

        # Normalize output
        title = ai_resp.get("title") or req.title
        out_sections = []
        for s in ai_resp.get("sections", []) or []:
            out_sections.append(
                MediumGeneratedSection(
                    id=str(s.get("id")),
                    heading=s.get("heading") or "",
                    content=s.get("content") or "",
                    wordCount=int(s.get("wordCount") or 0),
                    sources=[
                        # map to ResearchSource shape if possible; keep minimal
                        ResearchSource(title=src.get("title", ""), url=src.get("url", ""))
                        for src in (s.get("sources") or [])
                    ] or None,
                )
            )

        duration_ms = int((time.time() - start) * 1000)
        result = MediumBlogGenerateResult(
            success=True,
            title=title,
            sections=out_sections,
            model="gemini-2.5-flash",
            generation_time_ms=duration_ms,
            safety_flags=None,
        )

        # Cache the result for future use
        try:
            self.cache.cache_content(
                keywords=req.researchKeywords or [],
                sections=sections_for_cache,
                global_target_words=req.globalTargetWords or 1000,
                persona_data=req.persona.dict() if req.persona else None,
                tone=req.tone or "professional",
                audience=req.audience or "general",
                result=result.dict()
            )
            logger.info(f"Cached content result for keywords: {req.researchKeywords}")
        except Exception as cache_error:
            logger.warning(f"Failed to cache content result: {cache_error}")
            # Don't fail the entire operation if caching fails

        return result
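A hedged call sketch for MediumBlogGenerator; it assumes a fully populated MediumBlogGenerateRequest (construction depends on the blog_models definitions) and live cache/LLM services, so the entry point is left commented out.

import asyncio
from services.blog_writer.content.medium_blog_generator import MediumBlogGenerator


async def run(req, user_id: str):
    generator = MediumBlogGenerator()
    # A cache hit returns immediately with generation_time_ms == 0 and cache_hit == True
    result = await generator.generate_medium_blog_with_progress(req, task_id="medium_demo", user_id=user_id)
    for section in result.sections:
        print(section.heading, section.wordCount)

# asyncio.run(run(req, user_id="user_123"))  # req: MediumBlogGenerateRequest built from an outline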
backend/services/blog_writer/content/source_url_manager.py (Normal file, 42 lines)
@@ -0,0 +1,42 @@
"""
SourceURLManager - selects the most relevant source URLs for a section.

Low-effort heuristic using keywords and titles; safe defaults if no research.
"""

from typing import List, Dict, Any


class SourceURLManager:
    def pick_relevant_urls(self, section: Any, research: Any, limit: int = 5) -> List[str]:
        if not research or not getattr(research, 'sources', None):
            return []

        section_keywords = set([k.lower() for k in getattr(section, 'keywords', [])])
        scored: List[tuple[float, str]] = []
        for s in research.sources:
            # Support both dict-style and object-style sources
            if isinstance(s, dict):
                url = s.get('url') or s.get('uri')
                title = s.get('title') or ''
            else:
                url = getattr(s, 'url', None) or getattr(s, 'uri', None)
                title = getattr(s, 'title', None) or ''
            if not url or not isinstance(url, str):
                continue
            title_l = (title or '').lower()
            # simple overlap score
            score = 0.0
            for kw in section_keywords:
                if kw and kw in title_l:
                    score += 1.0
            # prefer https and reputable domains lightly
            if url.startswith('https://'):
                score += 0.2
            scored.append((score, url))

        scored.sort(key=lambda x: x[0], reverse=True)
        dedup: List[str] = []
        for _, u in scored:
            if u not in dedup:
                dedup.append(u)
            if len(dedup) >= limit:
                break
        return dedup
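A tiny sketch of the scoring heuristic with dict-style stand-in sources.

from types import SimpleNamespace
from services.blog_writer.content.source_url_manager import SourceURLManager

section = SimpleNamespace(keywords=["vector database", "embeddings"])
research = SimpleNamespace(sources=[
    {"title": "Choosing a vector database", "url": "https://example.com/vector-database"},
    {"title": "Unrelated press release", "url": "http://example.org/news"},
])

# The https source whose title overlaps the section keywords ranks first
print(SourceURLManager().pick_relevant_urls(section, research, limit=2))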
backend/services/blog_writer/content/transition_generator.py (Normal file, 143 lines)
@@ -0,0 +1,143 @@
"""
TransitionGenerator - produces intelligent transitions between sections using LLM analysis.

Uses Gemini API for natural transitions while maintaining cost efficiency through smart caching.
"""

from typing import Optional, Dict
from loguru import logger
import hashlib

# Import the common gemini provider
from services.llm_providers.gemini_provider import gemini_text_response


class TransitionGenerator:
    def __init__(self):
        # Simple cache to avoid redundant LLM calls for similar transitions
        self._cache: Dict[str, str] = {}
        logger.info("✅ TransitionGenerator initialized with LLM-based generation")

    def generate_transition(self, previous_text: str, current_heading: str, use_llm: bool = True) -> str:
        """
        Return a 1-2 sentence bridge from previous_text into current_heading.

        Args:
            previous_text: Previous section content
            current_heading: Current section heading
            use_llm: Whether to use LLM generation (default: True for substantial content)
        """
        prev = (previous_text or "").strip()
        if not prev:
            return f"Let's explore {current_heading.lower()} next."

        # Create cache key
        cache_key = self._get_cache_key(prev, current_heading)

        # Check cache first
        if cache_key in self._cache:
            logger.debug("Transition generation cache hit")
            return self._cache[cache_key]

        # Determine if we should use LLM
        should_use_llm = use_llm and self._should_use_llm_generation(prev, current_heading)

        if should_use_llm:
            try:
                transition = self._llm_generate_transition(prev, current_heading)
                self._cache[cache_key] = transition
                logger.info("LLM-based transition generated")
                return transition
            except Exception as e:
                logger.warning(f"LLM transition generation failed, using fallback: {e}")
                # Fall through to heuristic generation

        # Heuristic fallback
        transition = self._heuristic_transition(prev, current_heading)
        self._cache[cache_key] = transition
        return transition

    def _should_use_llm_generation(self, previous_text: str, current_heading: str) -> bool:
        """Determine if content is substantial enough to warrant LLM generation."""
        # Use LLM for substantial previous content (>100 words) or complex headings
        word_count = len(previous_text.split())
        complex_heading = len(current_heading.split()) > 2 or any(char in current_heading for char in [':', '-', '&'])

        return word_count > 100 or complex_heading

    def _llm_generate_transition(self, previous_text: str, current_heading: str) -> str:
        """Use Gemini API for intelligent transition generation."""

        # Truncate previous text to minimize tokens while keeping context
        prev_truncated = previous_text[-200:]  # Last 200 chars usually contain the conclusion

        prompt = f"""
Create a smooth, natural 1-2 sentence transition from the previous content to the new section.

PREVIOUS CONTENT (ending): {prev_truncated}
NEW SECTION HEADING: {current_heading}

Requirements:
- Write exactly 1-2 sentences
- Create a logical bridge between the topics
- Use natural, engaging language
- Avoid repetition of the previous content
- Lead smoothly into the new section topic

Generate only the transition text, no explanations or formatting.
"""

        try:
            result = gemini_text_response(
                prompt=prompt,
                temperature=0.6,  # Balanced creativity and consistency
                max_tokens=300,  # Increased tokens for better transitions
                system_prompt="You are an expert content writer creating smooth transitions between sections."
            )

            if result and result.strip():
                # Clean up the response
                transition = result.strip()
                # Ensure it's 1-2 sentences
                sentences = transition.split('. ')
                if len(sentences) > 2:
                    transition = '. '.join(sentences[:2]) + '.'
                return transition
            else:
                logger.warning("LLM transition response empty, using fallback")
                return self._heuristic_transition(previous_text, current_heading)

        except Exception as e:
            logger.error(f"LLM transition generation error: {e}")
            return self._heuristic_transition(previous_text, current_heading)

    def _heuristic_transition(self, previous_text: str, current_heading: str) -> str:
        """Fallback heuristic-based transition generation."""
        tail = previous_text[-240:]

        # Enhanced heuristics based on content patterns
        if any(word in tail.lower() for word in ["problem", "issue", "challenge"]):
            return f"Now that we've identified the challenges, let's explore {current_heading.lower()} to find solutions."
        elif any(word in tail.lower() for word in ["solution", "approach", "method"]):
            return f"Building on this approach, {current_heading.lower()} provides the next step in our analysis."
        elif any(word in tail.lower() for word in ["important", "crucial", "essential"]):
            return f"Given this importance, {current_heading.lower()} becomes our next focus area."
        else:
            return (
                f"Building on the discussion above, this leads us into {current_heading.lower()}, "
                f"where we focus on practical implications and what to do next."
            )

    def _get_cache_key(self, previous_text: str, current_heading: str) -> str:
        """Generate cache key from content hashes."""
        # Use last 100 chars of previous text and heading for cache key
        prev_hash = hashlib.md5(previous_text[-100:].encode()).hexdigest()[:8]
        heading_hash = hashlib.md5(current_heading.encode()).hexdigest()[:8]
        return f"{prev_hash}_{heading_hash}"

    def clear_cache(self):
        """Clear transition cache (useful for testing or memory management)."""
        self._cache.clear()
        logger.info("TransitionGenerator cache cleared")
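A minimal sketch of TransitionGenerator's heuristic fallback (use_llm=False), which requires no API key.

from services.blog_writer.content.transition_generator import TransitionGenerator

gen = TransitionGenerator()
prev = "The biggest problem teams hit is keeping terminology consistent across sections."
print(gen.generate_transition(prev, "Style Guides That Scale", use_llm=False))
# -> "Now that we've identified the challenges, let's explore style guides that scale to find solutions."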