Files
ALwrity/ToBeMigrated/ai_writers/blog_rewriter_updater/blog_rewriter_utils.py
2025-08-06 16:29:49 +05:30

595 lines
25 KiB
Python

"""
Blog Rewriter Utilities Module
This module contains the core functionality for rewriting and updating blog content,
including content extraction, analysis, research, and rewriting capabilities.
"""
import json
import logging
import os
import re
import time
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Import required modules from the project
from ...gpt_providers.text_generation.main_text_generation import llm_text_gen
from ...gpt_providers.text_to_image_generation.main_generate_image_from_prompt import generate_image
from ...ai_web_researcher.metaphor_basic_neural_web_search import metaphor_search_articles
from ...ai_web_researcher.tavily_ai_search import do_tavily_ai_search
# Configure logging (note: this configures the root logger as a side effect of importing the module)
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

# Length limits for titles and meta descriptions
MAX_TITLE_LENGTH = 70
MAX_META_DESCRIPTION_LENGTH = 160

# Supported rewrite modes: mode key -> human-readable description
REWRITE_MODES = dict(
    standard="Standard rewrite with improved clarity and flow",
    seo_optimization="Optimize for search engines with targeted keywords",
    simplification="Simplify complex content for broader audience",
    expansion="Expand with additional details and examples",
    fact_check="Focus on fact-checking and updating information",
    tone_shift="Change the tone while preserving content",
    modernization="Update outdated content with current information",
)

# Tone names offered for the rewritten text
TONE_OPTIONS = [
    "Professional",
    "Conversational",
    "Academic",
    "Enthusiastic",
    "Authoritative",
    "Friendly",
    "Technical",
    "Inspirational",
]
class BlogRewriter:
    """Class to handle blog rewriting functionality."""

    def __init__(self):
        """Create a rewriter whose state containers all start out empty."""
        # Each instance gets its own fresh containers (nothing is shared
        # between instances).
        self.image_suggestions: List[Any] = []
        self.content_analysis: Dict[str, Any] = {}
        self.research_results: Dict[str, Any] = {}
        self.rewritten_content: Dict[str, Any] = {}
        self.original_content: Dict[str, Any] = {}
def extract_content_from_url(self, url: str) -> Dict[str, Any]:
    """
    Download a web page and extract its blog-relevant parts.

    Args:
        url: The URL to extract content from

    Returns:
        Dictionary with the keys ``title``, ``meta_description``, ``content``,
        ``headings`` (list of ``{"level", "text"}``), ``images`` (list of
        dicts with ``url``, ``alt_text``, ``title``, ``class``, ``width``,
        ``height``), ``publish_date``, ``author`` and ``url``. On any failure
        the same keys are returned with empty values plus an ``error`` key
        holding the exception message; the exception itself is not raised.
    """
    logger.info(f"Extracting content from URL: {url}")
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # <title>.string is None when the tag is empty or holds nested
        # markup; get_text() always yields a string.
        title = soup.title.get_text().strip() if soup.title else ""

        meta_desc = self._first_meta_content(soup, [{"name": "description"}]) or ""

        # Headings, images, date and author are collected BEFORE the main
        # text, because main-text extraction removes nav/header/footer/...
        # elements from the tree and would otherwise hide whatever they hold.
        headings = [
            {"level": int(h.name[1]), "text": h.get_text().strip()}
            for h in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
        ]
        images = self._collect_images(soup, url)

        # Publish date: meta tags first, then a date-looking HTML element
        publish_date = self._first_meta_content(soup, [
            {"property": "article:published_time"},
            {"property": "datePublished"},
            {"property": "dateCreated"},
            {"property": "dateModified"},
        ])
        if not publish_date:
            date_elem = soup.find(["time", "span", "div"], class_=re.compile(r"date|time|publish|posted|created"))
            if date_elem and date_elem.get_text():
                publish_date = date_elem.get_text().strip()

        # Author: meta tags first, then a byline-looking HTML element
        author = self._first_meta_content(soup, [
            {"name": "author"},
            {"property": "article:author"},
            {"property": "author"},
        ])
        if not author:
            author_elem = soup.find(["a", "span", "div"], class_=re.compile(r"author|byline|writer|posted-by"))
            if author_elem and author_elem.get_text():
                author = author_elem.get_text().strip()

        # Destructive step, deliberately last (see comment above)
        content = self._normalize_whitespace(self._extract_main_text(soup))

        # Log content extraction results
        logger.info(f"Extracted content length: {len(content)} characters")
        logger.info(f"Found {len(headings)} headings")
        logger.info(f"Found {len(images)} images")
        logger.info(f"Publish date: {publish_date}")
        logger.info(f"Author: {author}")
        return {
            "title": title,
            "meta_description": meta_desc,
            "content": content,
            "headings": headings,
            "images": images,
            "publish_date": publish_date,
            "author": author,
            "url": url
        }
    except Exception as e:
        # Best-effort boundary: callers get an empty result with an "error"
        # key instead of an exception.
        logger.error(f"Error extracting content from URL: {e}")
        return {
            "title": "",
            "meta_description": "",
            "content": "",
            "headings": [],
            "images": [],
            "publish_date": None,
            "author": None,
            "url": url,
            "error": str(e)
        }

@staticmethod
def _first_meta_content(soup, candidates: List[Dict[str, str]]) -> Optional[str]:
    """Return the ``content`` of the first <meta> tag matching any attribute set in *candidates*, else None."""
    for attrs in candidates:
        tag = soup.find("meta", attrs=attrs)
        if tag and "content" in tag.attrs:
            return tag["content"]
    return None

@staticmethod
def _collect_images(soup, page_url: str) -> List[Dict[str, Any]]:
    """Return metadata for every <img> with a non-``data:`` src, with URLs resolved against *page_url*."""
    images = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src or src.startswith("data:"):
            continue
        images.append({
            # urljoin leaves absolute URLs untouched and correctly resolves
            # root-relative, page-relative and protocol-relative ("//host/...")
            # references.
            "url": urljoin(page_url, src),
            "alt_text": img.get("alt", ""),
            "title": img.get("title", ""),
            "class": img.get("class", []),
            "width": img.get("width"),
            "height": img.get("height")
        })
    return images

@staticmethod
def _extract_main_text(soup) -> str:
    """Return the raw text of the most specific content container found (mutates *soup*)."""
    noise_tags = ["nav", "aside", "footer", "comments", "script", "style", "header"]
    content_classes = ["post-content", "entry-content", "article-content", "blog-content", "content-area"]

    def containers():
        # Ordered from most to least specific. Lazily evaluated so each
        # lookup sees the tree as left by the previous attempt.
        yield soup.find("article")
        for class_name in content_classes:
            yield soup.find("div", class_=class_name)
        # The generic pattern also matches every class listed above, so it
        # must come after them or they would never be consulted.
        yield soup.find(["main", "div", "section"], class_=re.compile(r"content|article|post|entry|main|body"))
        yield soup.find("body")

    for container in containers():
        if container is None:
            continue
        # Remove unwanted elements (the same list for every strategy)
        for elem in container.find_all(noise_tags):
            elem.decompose()
        text = container.get_text(separator="\n\n")
        if text.strip():
            return text
    return ""

@staticmethod
def _normalize_whitespace(text: str) -> str:
    """Collapse runs of spaces/tabs and excess blank lines while keeping paragraph breaks."""
    text = re.sub(r'[^\S\n]+', ' ', text)   # any whitespace except newlines -> one space
    text = re.sub(r' ?\n ?', '\n', text)    # trim spaces hugging a newline
    text = re.sub(r'\n{3,}', '\n\n', text)  # at most one blank line in a row
    return text.strip()
def analyze_content(self, content: Dict[str, Any]) -> Dict[str, Any]:
    """
    Analyze the extracted content to provide insights.

    Args:
        content: Dictionary containing extracted content. The keys read here
            are ``content`` (text), ``headings`` (list of dicts with a
            ``level`` key), ``publish_date`` and ``images`` (list of dicts,
            optionally with ``alt_text``).

    Returns:
        Dictionary with the keys ``metrics`` (word/sentence/paragraph counts
        and averages), ``heading_structure`` (heading level -> count),
        ``content_age`` (``days``/``months``/``years``, or an ``error``
        message when the date is missing or unparseable) and ``images``
        (``count`` and ``with_alt_text``).
    """
    logger.info("Analyzing content")
    analysis = {}

    # Basic metrics. Empty pieces produced by the splits (e.g. after the
    # final full stop, or for empty text) are not counted.
    text_content = content.get("content", "") or ""
    word_count = len(text_content.split())
    sentence_count = sum(1 for s in re.split(r'[.!?]+', text_content) if s.strip())
    paragraph_count = sum(1 for p in re.split(r'\n\n+', text_content) if p.strip())
    analysis["metrics"] = {
        "word_count": word_count,
        "sentence_count": sentence_count,
        "paragraph_count": paragraph_count,
        "avg_words_per_sentence": round(word_count / max(sentence_count, 1), 1),
        "avg_sentences_per_paragraph": round(sentence_count / max(paragraph_count, 1), 1)
    }

    # Heading structure analysis: how many headings exist per level
    heading_structure = {}
    for h in content.get("headings", []):
        level = h["level"]
        heading_structure[level] = heading_structure.get(level, 0) + 1
    analysis["heading_structure"] = heading_structure

    # Content age analysis
    publish_date = content.get("publish_date")
    if publish_date:
        pub_date = self._parse_publish_date(publish_date)
        if pub_date is None:
            logger.warning(f"Could not parse publish date: {publish_date}")
            analysis["content_age"] = {"error": "Could not determine content age"}
        else:
            # Use "now" in the same timezone flavour as the parsed date:
            # subtracting an aware datetime from a naive one raises
            # TypeError. A tzinfo of None yields a naive local "now".
            now = datetime.now(pub_date.tzinfo)
            age_days = (now - pub_date).days
            analysis["content_age"] = {
                "days": age_days,
                "months": round(age_days / 30, 1),
                "years": round(age_days / 365, 1)
            }
    else:
        analysis["content_age"] = {"error": "No publish date found"}

    # Image analysis
    images = content.get("images", [])
    analysis["images"] = {
        "count": len(images),
        "with_alt_text": sum(1 for img in images if img.get("alt_text"))
    }
    return analysis

@staticmethod
def _parse_publish_date(raw: Any) -> Optional[datetime]:
    """Parse *raw* as ISO 8601 or one of a few common date formats; return None when nothing matches."""
    text = str(raw).strip()
    # ISO 8601 first. A trailing "Z" is rewritten because fromisoformat()
    # only accepts it natively on newer Python versions.
    iso_text = text[:-1] + "+00:00" if text.endswith("Z") else text
    try:
        return datetime.fromisoformat(iso_text)
    except ValueError:
        pass
    date_formats = (
        "%Y-%m-%d", "%d-%m-%Y", "%B %d, %Y", "%b %d, %Y",
        "%d %B %Y", "%d %b %Y", "%Y/%m/%d", "%d/%m/%Y"
    )
    for fmt in date_formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None
def conduct_research(self, title: str, content: str, research_depth: str = "medium") -> Dict[str, Any]:
    """
    Conduct web research to find updated information related to the blog content.

    An LLM is first asked for key topics and search queries; each of the
    first three topics is then searched with Exa and, when that yields
    fewer results than requested, with Tavily as well.

    Args:
        title: Blog title
        content: Blog content (only the first 1000 characters are sent to the LLM)
        research_depth: Depth of research (low, medium, high); selects
            2, 3 or 5 results per search, defaulting to 3 for unknown values

    Returns:
        Dictionary of the form ``{"topics": [{"topic": ..., "sources": [...]}]}``.
        Search failures are logged and leave the affected topic with
        fewer (possibly zero) sources.
    """
    logger.info(f"Conducting research with depth: {research_depth}")
    title = title or ""
    content = content or ""

    # Used whenever the LLM output cannot be turned into usable topics
    fallback_topics = [
        {"topic": title, "query": title + " latest information"},
        {"topic": "Updates on " + title, "query": title + " recent developments"}
    ]

    # Extract key topics from the content
    prompt = f"""
Extract 3-5 key topics or claims from this blog content that might need fact-checking or updating.
For each topic, provide a concise search query that would help find the most recent information.
Blog title: {title}
First 1000 characters of content:
{content[:1000]}...
Format your response as a JSON array of objects with 'topic' and 'query' fields.
"""
    try:
        llm_output = llm_text_gen(prompt)
        array_match = re.search(r'\[.*\]', llm_output, re.DOTALL)
        parsed = json.loads(array_match.group(0)) if array_match else []
        # Keep only well-formed items: the loop below indexes both keys, so
        # a stray string or an object with other key names must not get through.
        topics = [
            item for item in parsed
            if isinstance(item, dict) and item.get("topic") and item.get("query")
        ]
        if not topics:
            topics = fallback_topics
    except Exception as e:
        logger.error(f"Error extracting topics: {e}")
        topics = fallback_topics

    # Determine number of results based on research depth
    num_results = {"low": 2, "medium": 3, "high": 5}.get(research_depth, 3)
    research_results = {"topics": []}

    # Conduct research for each topic
    for topic in topics[:3]:  # Limit to 3 topics
        query = topic["query"]
        sources = []
        # NOTE(review): both search helpers are assumed to return an iterable
        # of dicts that may carry a "url" key -- confirm against their modules.

        # Try Exa search first
        try:
            exa_results = metaphor_search_articles(query, num_results=num_results)
            if exa_results:
                sources.extend(exa_results)
        except Exception as e:
            logger.warning(f"Exa search failed: {e}")

        # If Exa didn't return enough results, try Tavily
        if len(sources) < num_results:
            try:
                tavily_results = do_tavily_ai_search(query, num_results=num_results)
                if tavily_results:
                    seen_urls = {s.get("url") for s in sources}
                    seen_urls.discard(None)
                    for result in tavily_results:
                        result_url = result.get("url")
                        # Results without a URL cannot be deduplicated; keep them
                        if result_url is None or result_url not in seen_urls:
                            sources.append(result)
                            if result_url is not None:
                                seen_urls.add(result_url)
            except Exception as e:
                logger.warning(f"Tavily search failed: {e}")

        research_results["topics"].append({"topic": topic["topic"], "sources": sources})
    return research_results
def generate_rewrite_prompt(self, original_content: Dict[str, Any],
                            user_preferences: Dict[str, Any],
                            research_results: Dict[str, Any],
                            content_analysis: Dict[str, Any]) -> str:
    """
    Generate a prompt for the LLM to rewrite the blog.

    Args:
        original_content: Original blog content (``title`` and ``content`` are read)
        user_preferences: User preferences for rewriting (``rewrite_mode``,
            ``tone``, ``target_word_count``, ``keywords``,
            ``special_instructions``; all optional). ``keywords`` may be a
            list or a single string.
        research_results: Research results for updating content
            (``{"topics": [{"topic": ..., "sources": [...]}]}``)
        content_analysis: Analysis of the original content (word count and
            age in months are read)

    Returns:
        Prompt string for the LLM
    """
    logger.info("Generating rewrite prompt")

    # Extract key information
    title = original_content.get("title", "")
    content = original_content.get("content", "") or ""

    # Truncate content if it's too long
    max_content_length = 6000
    if len(content) > max_content_length:
        content_preview = content[:max_content_length] + "...\n[Content truncated due to length]"
    else:
        content_preview = content

    # Format research results (at most three sources per topic)
    summary_parts = []
    for topic in research_results.get("topics", []):
        summary_parts.append(f"\n## {topic['topic']}\n")
        for i, source in enumerate(topic.get("sources", [])[:3]):
            # "or" also covers a key that is present but None/empty
            snippet = str(source.get('content') or 'No content')
            summary_parts.append(f"Source {i+1}: {source.get('title', 'Untitled')}\n")
            summary_parts.append(f"URL: {source.get('url', 'No URL')}\n")
            summary_parts.append(f"Content: {snippet[:300]}...\n\n")
    research_summary = "".join(summary_parts)

    # A bare string must not be fed to join() directly, or it would be
    # split into single characters.
    keywords = user_preferences.get('keywords') or []
    if isinstance(keywords, str):
        keywords = [keywords]
    keywords_text = ', '.join(str(keyword) for keyword in keywords)

    word_count = content_analysis.get('metrics', {}).get('word_count', 'Unknown')
    age_months = content_analysis.get('content_age', {}).get('months', 'Unknown')

    # Build the prompt
    prompt = f"""
# Blog Rewriting Task
## Original Blog Information
Title: {title}
Word Count: {word_count}
Estimated Age: {age_months} months
## Rewriting Instructions
Mode: {user_preferences.get('rewrite_mode', 'standard')}
Target Tone: {user_preferences.get('tone', 'Professional')}
Target Word Count: {user_preferences.get('target_word_count', 'Same as original')}
Focus Keywords: {keywords_text}
## Special Instructions
{user_preferences.get('special_instructions', 'No special instructions')}
## Recent Research Findings
{research_summary if research_summary else "No research results available."}
## Original Content
{content_preview}
## Your Task
Please rewrite this blog post according to the instructions above. The rewritten blog should:
1. Maintain the core message and value of the original content
2. Update any outdated information based on the research findings
3. Adopt the requested tone and style
4. Incorporate the focus keywords naturally
5. Improve readability and engagement
6. Maintain a logical structure with appropriate headings
7. Include a compelling introduction and conclusion
## Output Format
Please provide your response in the following JSON format:
```json
{{
"title": "Rewritten title",
"meta_description": "SEO-optimized meta description (max 160 characters)",
"content": "Full rewritten content with proper markdown formatting",
"suggested_images": [
{{
"description": "Brief description of a suggested image",
"caption": "Suggested caption for the image",
"placement": "Where this image should be placed (e.g., 'After introduction', 'Before conclusion')"
}}
]
}}
```
Ensure the JSON is properly formatted and valid.
"""
    return prompt
def rewrite_blog(self, original_content: Dict[str, Any],
                 user_preferences: Dict[str, Any],
                 research_results: Dict[str, Any],
                 content_analysis: Dict[str, Any]) -> Dict[str, Any]:
    """
    Rewrite the blog based on original content, user preferences, and research.

    Args:
        original_content: Original blog content
        user_preferences: User preferences for rewriting
        research_results: Research results for updating content
        content_analysis: Analysis of the original content

    Returns:
        Dictionary containing rewritten content with at least the keys
        ``title``, ``meta_description``, ``content`` and
        ``suggested_images``. Fields the LLM omitted are filled from
        *original_content*. If the LLM call or the parsing fails, the
        original title/meta description/content are returned together with
        an ``error`` key; no exception is raised for those failures.
    """
    logger.info("Rewriting blog content")
    # Generate the prompt
    prompt = self.generate_rewrite_prompt(
        original_content, user_preferences, research_results, content_analysis
    )
    # Call the LLM to rewrite the content
    try:
        response = llm_text_gen(prompt)
        rewritten_content = self._parse_rewrite_response(response)

        # Validate the response structure
        for field in ("title", "meta_description", "content"):
            if field not in rewritten_content:
                rewritten_content[field] = original_content.get(field, "")
                logger.warning(f"Missing required field '{field}' in rewritten content")

        # Ensure suggested_images exists
        if "suggested_images" not in rewritten_content:
            rewritten_content["suggested_images"] = []

        # Clean up the content field
        body = rewritten_content["content"]
        if not isinstance(body, str):
            # Ends up in the fallback below, like any other failure
            raise TypeError("'content' field of the rewritten blog is not a string")
        # Remove any remaining invalid control characters
        body = ''.join(ch for ch in body if ord(ch) >= 32 or ch in '\n\r\t')
        # Tidy whitespace WITHOUT flattening newlines: the prompt asks for
        # markdown, whose headings, lists and paragraphs depend on them.
        body = body.replace('\r\n', '\n').replace('\r', '\n')
        body = re.sub(r'[ \t]+\n', '\n', body)   # trailing spaces on a line
        body = re.sub(r'\n{3,}', '\n\n', body)   # at most one blank line in a row
        rewritten_content["content"] = body.strip()
        return rewritten_content
    except Exception as e:
        logger.error(f"Error rewriting blog: {e}")
        return {
            "title": original_content.get("title", ""),
            "meta_description": original_content.get("meta_description", ""),
            "content": original_content.get("content", ""),
            "suggested_images": [],
            "error": str(e)
        }

@staticmethod
def _parse_rewrite_response(response: str) -> Dict[str, Any]:
    """Extract the JSON object from an LLM response; raise if no candidate parses to a dict."""
    # Clean the response of any invalid control characters
    response = ''.join(ch for ch in response if ord(ch) >= 32 or ch in '\n\r\t')

    # Candidate JSON texts, most specific first.
    candidates = []
    fenced = re.search(r'```json\s*(.*?)\s*```', response, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))
    # The non-greedy fenced match stops at the first closing fence, which is
    # too early when the content itself contains code fences; the outermost
    # brace span covers that case.
    braces = re.search(r'\{.*\}', response, re.DOTALL)
    if braces:
        candidates.append(braces.group(0))
    candidates.append(re.sub(r'```(json)?', '', response))

    last_error = ValueError("No JSON found in LLM response")
    for candidate in candidates:
        candidate = candidate.strip()
        # Legacy best-effort repair, kept as a second attempt for backward
        # compatibility (it mainly helps when the quotes arrive backslash-escaped).
        repaired = candidate.replace('\\n', '\\\\n').replace('\\"', '"').replace('\\t', '\\\\t')
        for attempt in (candidate, repaired):
            try:
                # strict=False accepts raw newlines/tabs inside JSON strings,
                # which LLMs commonly emit in long "content" values.
                parsed = json.loads(attempt, strict=False)
            except json.JSONDecodeError as e:
                last_error = e
                continue
            if isinstance(parsed, dict):
                return parsed
            last_error = ValueError("LLM response JSON is not an object")
    logger.error(f"JSON parsing error: {last_error}")
    raise last_error
def generate_image(self, image_prompt: str, style: str = "realistic") -> str:
    """
    Produce an image for the given prompt via the imported image generator.

    Args:
        image_prompt: Prompt for image generation
        style: Style of the image

    Returns:
        Whatever the generator returns (documented here as the path to the
        generated image), or an empty string if the generator raised.
    """
    logger.info(f"Generating image with prompt: {image_prompt}")
    result = ""
    try:
        # The bare name refers to the module-level generate_image imported at
        # the top of the file, not to this method, so there is no recursion.
        result = generate_image(image_prompt, style=style)
    except Exception as err:
        logger.error(f"Error generating image: {err}")
    return result